#from lxml.html import fromstring
from lxml.etree import tostring
from lxml import etree
import sys

import datetime
import HTMLParser
from StringIO import StringIO

# Bound HTMLParser.unescape, which turns HTML entity references in a string
# back into the characters they stand for.
# NOTE(review): nothing in the visible part of this file calls it -- confirm
# it is unused before removing.
unescape = HTMLParser.HTMLParser().unescape

def fromstring(x):
    """Parse the HTML text *x* with lxml's HTML parser.

    x -- the complete HTML document as a string

    Returns whatever etree.parse() produces for the document, i.e. a tree
    object that supports find() lookups such as './/body'.
    """
    parser = etree.HTMLParser()
    return etree.parse(StringIO(x), parser)

def is_not_annotation(x):
    """Return True unless *x* is a <span class="annotations"> element.

    x -- an element exposing a ``tag`` attribute and a ``get()`` method for
         attribute lookup
    """
    return x.tag != 'span' or x.get('class') != 'annotations'

def to_timestamp(x):
    """Convert a date string to whole seconds since 1970-01-01 00:00:00.

    x -- a string in the form 'Thu, 01 Jan 1970 00:00:00 GMT'
         (format '%a, %d %b %Y %H:%M:%S %Z')

    Returns the number of seconds as an int. The parsed value is a naive
    datetime and is compared against a naive epoch, so no timezone offset
    is applied.
    """
    epoch = datetime.datetime(1970, 1, 1)
    parsed = datetime.datetime.strptime(x, '%a, %d %b %Y %H:%M:%S %Z')
    elapsed = parsed - epoch
    return int(round(elapsed.total_seconds()))


def scrape(x):
    """Extract one log entry from the element *x*.

    x -- an element for a single log line. Its 'id' attribute is the line
         index; its first child is a link whose text is the speaker and
         whose 'title' attribute is the timestamp string. The remaining
         children (minus annotation spans) form the message body.

    Returns a 4-tuple (index, time, voice, content):
      * index   -- the 'id' attribute of *x*
      * time    -- the Unix timestamp as a unicode string
      * voice   -- the speaker, or u'*' when the text after the speaker
                   does not start with ': '
      * content -- the message text; in the u'*' case the speaker name is
                   prepended to it

    Links whose text is not itself a URL are rendered as '[href][text]',
    with the 'http://btcbase.org/log/' prefix stripped from the href.
    """
    log_prefix = 'http://btcbase.org/log/'

    index = x.get('id')
    first_link = x[0]
    time = to_timestamp(first_link.get('title'))
    voice = first_link.text
    # .tail is None when nothing follows the link; start from an empty
    # string so the concatenations below always have something to extend.
    content = first_link.tail or u''

    for y in x[1:]:
        if not is_not_annotation(y):
            continue
        # .text is None for elements without leading text.
        text = y.text or u''
        if y.tag == 'a' and not text.startswith(('http', '<http')):
            href = y.get('href') or u''
            if href.startswith(log_prefix):
                href = href[len(log_prefix):]
            content += '[' + href + ']'
            content += '[' + text + ']'
        else:
            content += text
        if y.tail:
            content += y.tail

    if content.startswith(': '):
        return index, unicode(time), voice, content[2:]
    return index, unicode(time), u'*', voice + content

# select a unicode turd to use as replacement
def find_replacement(text, counter, start):
    """Pick a character that does not occur in *text*.

    Candidates are the code points start+counter, start+counter+1, ...;
    the first one not found in *text* is chosen.

    text    -- the text the placeholder must not collide with
    counter -- offset from *start* at which to begin searching
    start   -- base code point of the candidate range

    Returns (character, next_counter) where next_counter is the offset
    right after the chosen character, ready for the next call.
    """
    offset = counter
    while True:
        candidate = unichr(start + offset)
        if candidate not in text:
            return candidate, offset + 1
        offset += 1


def scrape_from_files(files, output):
    """Scrape every HTML log file in *files* and write the entries to *output*.

    files  -- iterable of paths to UTF-8 encoded HTML log files
    output -- object with a write() method that accepts byte strings

    Each entry returned by scrape() is written as its fields joined by ';',
    encoded as UTF-8 and followed by a newline.
    """
    start = 0x1F300  # first code point tried as a placeholder character
    # C0 control characters, 0x00 through 0x1F inclusive.
    control_chars = [chr(i) for i in range(32)]

    for path in files:
        # Plain "r": the previous "ru" is not a documented mode string.
        with open(path, "r") as s:
            text = s.read().decode('utf-8')

        # Placeholders are only guaranteed to be absent from THIS file's
        # text, so the bookkeeping is reset per file. Carrying it over
        # could turn a genuine character of a later file into a control
        # character during restoration.
        counter = 0
        replacements = {}

        # some lines contain control characters
        # the html parser will remove these while parsing
        # so swap each one for a placeholder character first
        for ch in control_chars:
            if ch in text:
                rch, counter = find_replacement(text, counter, start)
                text = text.replace(ch, rch)
                replacements[rch] = ch

        # parse html to a tree
        # go through the tree, the structure is fixed
        # so indexing is OK
        root = fromstring(text)
        body = root.find('.//body')
        lines = body[3:-3]
        log = [list(scrape(line)) for line in lines]

        # put the original control chars back into the message content
        if replacements:
            for entry in log:
                content = entry[-1]
                for rch, ch in replacements.items():
                    content = content.replace(rch, ch)
                entry[-1] = content

        # write the lines, encoding in utf-8
        for entry in log:
            line = u";".join(entry)
            output.write(line.encode('utf-8'))
            output.write('\n')

if __name__ == "__main__":
    # Every command-line argument is a log file to scrape; the result
    # goes to standard output.
    scrape_from_files(sys.argv[1:], sys.stdout)