#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
# K M Briggs 2009-02-03 10:45 

import re
from sys import stderr,exit,argv
import cPickle
from os.path import expanduser
from tkinter_app_00 import App,win

# Root directory of the corpus.  win is imported from tkinter_app_00
# (presumably true when running on Windows - confirm there).  The '~' form
# is expanded with expanduser() at the point where the path is opened.
corpus_path='./0163/' if win else '~/0163/'

# Corpus character entities mapped to the brace codes used in the stored
# text; to_ascii() later turns the brace codes into the actual letters.
fixes=dict([
  ('&d;',    '{dh}'),
  ('&t;',    '{th}'),
  ('&D;',    '{DH}'),
  ('&T;',    '{TH}'),
  ('&ae;',   '{ae}'),
  ('&AE;',   '{AE}'),
  ('&amp;',  '{&}'),
  ('&ouml;', '{oe}'), # '\xc3',
])

def fix(t):
  """Return t with every entity listed in fixes replaced by its brace code."""
  for entity,code in fixes.items():
    t=t.replace(entity,code)
  return t

# Patterns for the corpus markup.  re.DOTALL lets '.*?' run across the
# newlines inside an element; the literal dot of "tei.2" is escaped so that
# it cannot match an arbitrary character.
# One whole text:  <tei.2 id="T14060"> ... </tei.2>
re_tei_2=re.compile(r'<tei\.2\s+id="(?P<id>T\d{5})">(?P<txt>.*?)</tei\.2>',re.DOTALL)
# One line of a text:  <s id="T17350000600" n="6"> &T;onon to cuntan heale.</s>
re_s=re.compile(r'<s\s+id="(?P<id>T\d{11})"\s+n="(?P<n>\d+(\.\d+)?)">(?P<txt>.*?)</s>',re.DOTALL)
# Source description:  <sourcedesc><bibl>Bounds, Sawyer 1: Birch 1885-99, no. 3</bibl></sourcedesc>
re_sourcedesc=re.compile(r'<sourcedesc>\s*<bibl>\s*(?P<sd>.*?)\s*</bibl>\s*</sourcedesc>',re.DOTALL)

def get_tei2_dictionary():
  """Return a dict mapping each text id (e.g. 'T14060') to its full text.

  The dict is cached in text_dictionary.pkl in the current directory.  If
  that cache cannot be loaded, the dict is rebuilt from the corpus segment
  files seg01..seg77 under corpus_path/oecorp and the cache is rewritten.

  In the stored text <corr>..</corr> becomes [|..|], <foreign>..</foreign>
  becomes [[..]], newlines become spaces and entities are rewritten by
  fix().  The lines of a text are concatenated without a separator and the
  result is stripped of surrounding whitespace.

  Errors while reading the corpus or writing the cache propagate.
  """
  fn='text_dictionary.pkl'
  try:
    with open(fn,'rb') as pkl:
      # NOTE: unpickling is only safe because this cache is written by this
      # program; never point it at a file from an untrusted source.
      return cPickle.load(pkl)
  except Exception:
    # A missing or unreadable cache is not an error: rebuild it below.
    # (Exception rather than a bare except, so Ctrl-C is not swallowed.)
    pass
  print>>stderr,'building %s...'%fn,
  segments=[]
  for n in range(1,78):
    print>>stderr,'seg%02d'%n,
    with open(expanduser(corpus_path+'/oecorp/seg%02d'%n),'r') as f:
      segments.append(f.read())
  print>>stderr # end the progress line
  corpus=''.join(segments)
  tei2_d={}
  for tei_2 in re_tei_2.finditer(corpus): # for each text
    full_text=[]
    for m in re_s.finditer(tei_2.group('txt')): # for each line in the text
      txt=m.group('txt')
      txt=txt.replace('<corr>','[|').replace('</corr>','|]')
      txt=txt.replace('<foreign>','[[').replace('</foreign>',']]')
      txt=txt.replace('\n',' ')
      full_text.append(fix(txt))
    tei2_d[tei_2.group('id')]=''.join(full_text).strip()
  with open(fn,'wb') as pkl:
    cPickle.dump(tei2_d,pkl,-1) # -1: highest pickle protocol available
  return tei2_d

def get_desc_dictionary():
  """Return a dict mapping each text id to its source description.

  The description is the <bibl> content of the text's <sourcedesc>, with
  entities rewritten by fix().  The dict is cached in desc_dictionary.pkl
  in the current directory.  If that cache cannot be loaded, the dict is
  rebuilt from the corpus segment files seg01..seg77 under
  corpus_path/oecorp and the cache is rewritten.

  Errors while reading the corpus or writing the cache propagate.
  """
  fn='desc_dictionary.pkl'
  try:
    with open(fn,'rb') as pkl:
      # NOTE: unpickling is only safe because this cache is written by this
      # program; never point it at a file from an untrusted source.
      return cPickle.load(pkl)
  except Exception:
    # A missing or unreadable cache is not an error: rebuild it below.
    # (Exception rather than a bare except, so Ctrl-C is not swallowed.)
    pass
  print>>stderr,'building %s...'%fn,
  segments=[]
  for n in range(1,78):
    print>>stderr,'seg%02d'%n,
    with open(expanduser(corpus_path+'/oecorp/seg%02d'%n),'r') as f:
      segments.append(f.read())
  print>>stderr # end the progress line
  d={}
  for tei_2 in re_tei_2.finditer(''.join(segments)): # for each text
    found=re_sourcedesc.search(tei_2.group(0))
    # A text without a <sourcedesc> gets an empty description instead of
    # aborting the whole build; search() looks every matched id up here.
    sd=found.group('sd') if found else ''
    d[tei_2.group('id')]=fix(sd)
  with open(fn,'wb') as pkl:
    cPickle.dump(d,pkl,-1) # -1: highest pickle protocol available
  return d

def to_ascii(t):
  """Return t with the brace codes replaced by the letters they stand for.

  Despite the name, the result contains non-ASCII letters: this is the
  reverse of the {xx} codes produced by fix().
  """
  # Applied in this order, one code after another.
  for code,letter in (
    ('{oe}','ö'),
    ('{ae}','æ'),
    ('{AE}','Æ'),
    ('{dh}','ð'),
    ('{DH}','Ð'),
    ('{th}','þ'),
    ('{TH}','Þ'),
    ('{&}','&'),
  ):
    t=t.replace(code,letter)
  return t

def context(txt,s,e,span=15):
  """Return a display snippet of txt around the match txt[s:e].

  The window starts span characters before s and ends span characters
  after e, and is then widened outwards to the nearest space (or to the
  start/end of txt) so that it does not cut a word in half.

  Returns (snippet,a,b): snippet is txt[a:b] with surrounding ' ', '.',
  ',' and '7' characters stripped and the brace codes converted by
  to_ascii(); a and b are the slice bounds that were used.
  """
  n=len(txt)
  a,b=max(0,s-span),min(e+span,n)
  # Stop at 0 rather than running on to -1: a negative start would make the
  # slice below count from the END of txt and give a wrong (usually empty)
  # snippet for a match in the first word.  It also keeps empty txt safe.
  while a>0 and txt[a]!=' ': a-=1
  while b<n and txt[b]!=' ': b+=1
  q=txt[a:b]
  # '7' is stripped along with the punctuation - presumably it is used as an
  # abbreviation sign in the corpus; confirm before changing.
  q=q.strip(' .,7')
  q=to_ascii(q)
  return q,a,b

def search(regex,cb=False,v=False):
  """Search every text in tei2_d for regex and return a printable report.

  regex -- the pattern to look for; an empty pattern returns ('','').
  cb    -- true for a case-sensitive search; otherwise '(?i)' is prepended.
  v     -- unused; kept so existing callers keep working.

  Returns (report,regex).  The report has one '<text id> <context>' line
  per match, in text-id order, followed by one '  <text id> = <source
  description>' line per text that matched.  If nothing matched the report
  is 'No match to ' followed by the pattern.

  NOTE(review): an invalid pattern makes re.compile raise here, and a text
  id missing from desc_dictionary raises KeyError; confirm the caller copes.
  """
  if not regex: return '',''
  # The pattern is wrapped in one extra group exactly as before, so any
  # groups written by the user are numbered from 2.
  prefix='' if cb else '(?i)'
  rgx=re.compile(prefix+'('+regex+')')
  lines=[]
  matched=[] # ids of texts with at least one match, already in sorted order
  for tei2 in sorted(tei2_d):
    txt=tei2_d[tei2]
    found=False
    for m in rgx.finditer(txt):
      found=True
      cntxt=context(txt,m.start(),m.end())[0]
      lines.append('%6s %s\n'%(tei2,cntxt))
    if found: matched.append(tei2)
  for t in matched:
    lines.append('  %s = %s\n'%(t,to_ascii(desc_dictionary[t])))
  r=''.join(lines)
  if not r: r='No match to '+regex
  return r,regex

def init():
  """Load both lookup tables into the module globals read by search().

  Sets tei2_d from get_tei2_dictionary() and then desc_dictionary from
  get_desc_dictionary().
  """
  global tei2_d
  global desc_dictionary
  tei2_d=get_tei2_dictionary()
  desc_dictionary=get_desc_dictionary()

if __name__=='__main__':
  # Run as a script: create the GUI, handing init and search to App as its
  # init and action callbacks, then enter the event loop.
  app=App(
    title='COE - OE corpus search by Keith Briggs',
    init=init,
    action=search,
    checkbutton='case sensitive',
    help='help is available at http://keithbriggs.info/search_gui.html.\n [[...]] refers to foreign words (usually Latin).'
  )
  app.mainloop()
