#from lxml.html import fromstring
from lxml.etree import tostring
from lxml import etree
import sys
import datetime
import HTMLParser
from StringIO import StringIO
# Module-level shortcut: the bound `unescape` method of one shared
# HTMLParser instance (Python 2 `HTMLParser` module), created once at import.
# NOTE(review): presumably used further down to decode HTML entity
# references in scraped text -- confirm against the callers.
unescape = HTMLParser.HTMLParser().unescape
def fromstring(x):
    """Parse the HTML markup string *x* with lxml's HTML parser.

    A fresh ``etree.HTMLParser`` is created for every call, and the string
    is wrapped in a ``StringIO`` so that ``etree.parse`` can read it like a
    file.

    :param x: HTML source text.
    :returns: whatever ``etree.parse`` returns for the parsed document
        (an lxml ElementTree, not a bare element).
    """
    # This function used to be defined twice with identical bodies; the
    # second definition merely rebound the first, so only one is kept.
    parser = etree.HTMLParser()
    return etree.parse(StringIO(x), parser)
def is_not_annotation(x):
    """Tell whether element *x* is anything other than an annotations span.

    An element counts as an annotation only when its tag is ``span`` and
    its ``class`` attribute is exactly ``'annotations'``.

    :param x: an element-like object exposing ``.tag`` and ``.get(name)``.
    :returns: ``False`` for ``<span class="annotations">``, ``True`` otherwise.
    """
    # Any non-span element passes straight away; the class attribute is
    # only consulted for spans.
    if x.tag != 'span':
        return True
    return x.get('class') != 'annotations'
def to_timestamp(x):
    """Convert a date string to whole seconds since 1970-01-01 00:00:00.

    The string must match ``'%a, %d %b %Y %H:%M:%S %Z'``, for example
    ``'Thu, 01 Jan 1970 00:01:00 GMT'``.

    :param x: the date string to parse.
    :returns: the elapsed seconds as an ``int``.
    :raises ValueError: if *x* does not match the expected format
        (raised by ``strptime``).
    """
    # The zone name matched by %Z is not applied: the parsed value is naive
    # and is measured against a naive epoch, i.e. treated as if it were UTC.
    parsed = datetime.datetime.strptime(x, '%a, %d %b %Y %H:%M:%S %Z')
    epoch = datetime.datetime(1970, 1, 1)
    elapsed = parsed - epoch
    return int(round(elapsed.total_seconds()))
def scrape(x):
index = x.get('id')
first_link = x[0]
time = to_timestamp(first_link.get('title'))
rest = x[1:]
rest = filter(is_not_annotation, rest)
voice = first_link.text
if first_link.tail:
content = first_link.tail
for y in rest:
if y.tag == 'a':
href = y.get('href')
if not y.text.startswith('http') and not y.text.startswith('