#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
# KMB 2008-12-28

import re
import cPickle
from tkinter_app_00 import App,win

# ./sawyer.py 'tur\s+&aelig;t\s' 'tur\s+ae?t\s'
# ./sawyer.py 'tur\s+on\s'
# ./sawyer.py 'dicitur\s+[aeio]n?\s' 'nominatur\s+[aeio]n?\s' 'nuncupan?tur\s+[aeio]n?\s' 'appellatur\s+[aeio]n?\s' ' cognominatur\s+[aeio]n?\s'

# Lookup table: HTML character entity -> the single accented/special
# character it stands for.  Used by html_to_ascii() below.
html_to_ascii_table={
  # a
  '&aacute;':'á',
  '&agrave;':'à',
  '&auml;':'ä',
  '&acirc;':'â',
  '&#xe5;':'å',
  '&Aring;':'Å',
  '&Auml;':'Ä',
  '&Aacute;':'Á',
  # ash
  '&aelig;':'æ',
  '&AELIG;':'Æ',
  '&AElig;':'Æ',
  # c
  '&ccedil;':'ç',
  # eth
  '&eth;':'ð',
  '&ETH;':'Ð',
  # e
  '&egrave;':'è',
  '&Egrave;':'È',
  '&eacute;':'é',
  # i
  '&iacute;':'í',
  # o
  '&oslash;':'ø',
  '&oacute;':'ó',
  '&Oacute;':'Ó',
  '&Ouml;':'Ö',
  '&ouml;':'ö',
  # thorn
  '&thorn;':'þ',
  '&THORN;':'Þ',
  # u
  '&uuml;':'ü',
  # y
  '&yacute;':'ý',
}

def html_to_ascii(t):
  """Return t with every entity listed in html_to_ascii_table replaced
  by the character that table maps it to.  Other text is left untouched."""
  for entity,char in html_to_ascii_table.items():
    t=t.replace(entity,char)
  return t

def to_ascii(t):
  """Return t with the brace codes {ae} {AE} {dh} {DH} {th} {TH} {&}
  replaced by the characters they denote."""
  brace_codes=(
    ('{ae}','æ'),
    ('{AE}','Æ'),
    ('{dh}','ð'),
    ('{DH}','Ð'),
    ('{th}','þ'),
    ('{TH}','Þ'),
    ('{&}','&'),
  )
  # Applied in the order listed above.
  for code,char in brace_codes:
    t=t.replace(code,char)
  return t

def load_pkl():
  """Load the pickled corpus from 'sawyer.pkl' into the global `sawyer`.

  Returns None on success.  On any failure to open or unpickle the file,
  returns the string 'sawyer.pkl load failure'; when `win` is true the
  user is first asked to acknowledge the problem on the console.
  """
  global sawyer
  try:
    pkl=open('sawyer.pkl','rb')
    try:
      # NOTE(security): unpickling can execute arbitrary code, so
      # sawyer.pkl must only ever come from a trusted source.
      sawyer=cPickle.load(pkl)
    finally:
      # Close the file even when unpickling fails, so the handle never leaks.
      pkl.close()
  except Exception:
    # `Exception` rather than a bare except, so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    if win:
      # presumably `win` flags a Windows console that would close at once;
      # pause so the message can be read - TODO confirm against tkinter_app_00
      raw_input('Could not load sawyer.pkl - press any key to quit.')
    return 'sawyer.pkl load failure'
  
def Scmp(a,b):
  """Three-way comparison of two file names of the form 'html/S<n>.html'
  by their integer <n>.

  Returns -1, 0 or 1, like the Python 2 `cmp` builtin.  Raises ValueError
  if the part between 'html/S' and '.html' is not an integer.
  """
  # chop the 6-character 'html/S' prefix and the 5-character '.html' suffix
  x,y=int(a[6:-5]),int(b[6:-5])
  # (x>y)-(x<y) is the portable spelling of cmp(x,y); the `cmp` builtin
  # no longer exists in Python 3.
  return (x>y)-(x<y)

def re_find(p,z):
  """Return a tuple holding the index of every item of z for which
  p.match(item) succeeds (i.e. p matches at the start of the item)."""
  return tuple(idx for idx,item in enumerate(z) if p.match(item))

def find(word,cb=False):
  """Search every text in the global `sawyer` for the regular expression
  `word` and report each hit with some surrounding context.

  word -- regular expression source; an invalid pattern makes re.compile
          raise re.error, which is not caught here.
  cb   -- True for a case-sensitive search; False (the default) prefixes
          the pattern with (?i) to ignore case.

  Returns '' when `word` is empty.  Otherwise returns a 2-tuple:
  (report, to_ascii(word)) when there is at least one hit, where report
  has one newline-terminated line per hit of the form
  'S<number> (<date>) <context>', or ('No match.','') when there is none.

  `sawyer` must already be loaded; its keys are formatted with %d as the
  charter number and its values are the charter texts.
  """
  if not word: return ''
  re_clean_html_sup=re.compile(r'<sup>.*?</sup>')
  re_date=re.compile(r'A\.D\.\s+(?P<AD>\d{3,4})')
  rgx=re.compile(('(?i)','')[cb]+word)
  lines=[]
  for key in sawyer:
    number='%d'%key
    # Drop superscripts and the few markup tags in use, and flatten newlines.
    txt=re_clean_html_sup.sub('',sawyer[key])
    txt=txt.replace('<p>','').replace('</p>','').replace('<I>','').replace('</I>','').replace('\n',' ')
    # The first 'A.D. nnn' in the text, if any, is used as the date.
    date=''
    m=re_date.search(txt)
    if m: date=m.group('AD')
    if number=='1539': date='x-xi' # hard-coded date for S1539
    label='S%s'%number
    if date: label+=' (%s)'%date
    end=len(txt)
    for m in rgx.finditer(txt):
      # Context window: at least 30 characters before and 15 after the
      # match, widened outwards to the nearest space.  Both ends are clamped
      # to the text, so a match near the start no longer wraps round through
      # negative indices and one near the end no longer raises IndexError.
      a=max(m.start()-30,0)
      while a>0 and txt[a]!=' ': a-=1
      b=min(m.end()+15,end)
      while b<end and txt[b]!=' ': b+=1
      context=txt[a:b]
      # Normally the window starts on a space, which separates it from the
      # label; supply that space when the window starts at the text's start.
      if not context.startswith(' '): context=' '+context
      # Trim trailing blanks, '.', ',' and '7' from the context only, so an
      # all-trimmed context cannot eat digits off the label (S17 -> S1).
      lines.append(to_ascii(label+context.rstrip(' .,7'))+'\n')
  if lines:
    return ''.join(lines),to_ascii(word)
  return 'No match.',''

if __name__=='__main__':
  # Script entry point: build the GUI with load_pkl as its init hook and
  # find as its search action, then run it.
  app=App(
    title='Sawyer - OE charter corpus search by Keith Briggs',
    init=load_pkl,
    action=find,
    checkbutton='case sensitive',
    help='help is available at http://keithbriggs.info/search_gui.html.\n'
  )
  app.mainloop()
