"""Helpers for scraping message elements out of an HTML page.

The original file had lost its line breaks, which turned the whole module
into a single ``#`` comment.  The definitions are restored here as real code.
"""
# from lxml.html import fromstring  -- was already disabled in the original;
# a local fromstring() with the same name is defined below.
import datetime
import sys
from io import BytesIO

try:  # Python 2 module names, as used by the original script
    import HTMLParser
    from StringIO import StringIO
    unescape = HTMLParser.HTMLParser().unescape
except ImportError:  # Python 3: the modules were renamed
    import html.parser as HTMLParser
    from html import unescape
    from io import StringIO

try:
    from lxml import etree
    from lxml.etree import tostring
except ImportError:
    # Only parsing needs lxml; keeping the import soft lets the pure helpers
    # below be imported without it.  fromstring() raises a clear error instead.
    etree = None
    tostring = None

# Reference point and input layout used by to_timestamp().
_EPOCH = datetime.datetime(1970, 1, 1)
_TIMESTAMP_FORMAT = '%a, %d %b %Y %H:%M:%S %Z'


def fromstring(x):
    """Parse the HTML document *x* and return an lxml element tree.

    Args:
        x: HTML markup, either text or bytes.

    Returns:
        The tree produced by ``etree.parse`` with an ``etree.HTMLParser``.

    Raises:
        ImportError: if lxml is not installed.
    """
    if etree is None:
        raise ImportError('lxml is required to parse HTML')
    # io.StringIO rejects bytes on Python 3, so pick the buffer by input type.
    source = BytesIO(x) if isinstance(x, bytes) else StringIO(x)
    return etree.parse(source, etree.HTMLParser())


def is_not_annotation(x):
    """Return True unless *x* is a ``<span class="annotations">`` element.

    *x* only needs a ``tag`` attribute and a ``get(name)`` method.
    """
    return not (x.tag == 'span' and x.get('class') == 'annotations')


def to_timestamp(x):
    """Convert a date string to whole seconds since 1970-01-01 00:00:00.

    Args:
        x: text laid out like ``'Fri, 13 Feb 2009 23:31:30 GMT'``.

    Returns:
        int: seconds between the parsed date and the epoch.

    Raises:
        ValueError: if *x* does not match the expected layout.

    The zone name must be one ``strptime`` recognises, but it does not shift
    the result: the parsed datetime is naive and is compared against a naive
    epoch, so the clock fields are effectively read as UTC.
    """
    parsed = datetime.datetime.strptime(x, _TIMESTAMP_FORMAT)
    return int(round((parsed - _EPOCH).total_seconds()))


# NOTE(review): scrape() is cut off in the text available for this review (it
# stops inside a string literal), so it is NOT restored as code here.  The
# visible statements are kept verbatim below, in order; their nesting was lost
# with the line breaks and must be recovered from the complete original.
#
# def scrape(x):
# index = x.get('id')
# first_link = x[0]
# time = to_timestamp(first_link.get('title'))
# rest = x[1:]
# rest = filter(is_not_annotation, rest)
# voice = first_link.text
# if first_link.tail:
# content = first_link.tail
# for y in rest:
# if y.tag == 'a':
# href = y.get('href')
# if not y.text.startswith('http') and not y.text.startswith('