# Keith Briggs 2017-08-03
# coding=utf8

from sys import stderr,path,exit
import re
from regnal_year_03 import parse_regnal_year_spec,get_regnalyear,int_to_roman

# Verbose English wording for each compact qualifier code.  Every non-empty
# expansion ends with a space so that it can be placed directly in front of
# the text it qualifies.
wordy_expander=dict((
  # no qualifier
  ('',      ''),
  # degree of certainty
  ('?',     'perhaps '),
  ('p',     'probably '),
  # before / after
  ('<',     'before '),
  ('>',     'after '),
  # approximate dates
  ('circa', 'circa '),
  ('c',     'circa '),
  ('C',     'circa '),
  # undated documents
  ('nd',    'no date, '),
  ('n.d.',  'no date, '),
  # early / middle / late
  ('E',     'early '),
  ('M',     'middle of the '),
  ('L',     'late '),
  ('EM',    'early to middle '),
  ('ML',    'middle to late '),
  # halves
  ('1H',    'first half of the '),
  ('2H',    'second half of the '),
  # thirds
  ('1T',    'first third of the '),
  ('2T',    'middle third of the '),
  ('3T',    'last third of the '),
  # quarters
  ('1Q',    'first quarter of the '),
  ('2Q',    'second quarter of the '),
  ('3Q',    'third quarter of the '),
  ('4Q',    'fourth quarter of the '),
))

# Amount added to the sort key for each period-prefix code (keys are the
# upper-cased prefix as written in a date descriptor).
eml_shift=dict((
  # early / middle / late
  ('E',   33.0),
  ('M',   66.0),
  ('L',  100.0),
  ('EM',  75.0),
  ('ML', 100.0),
  # halves
  ('1H',  50.0),
  ('2H', 100.0),
  # thirds
  ('1T',  33.0),
  ('2T',  66.0),
  ('3T', 100.0),
  # quarters
  ('1Q',  25.0),
  ('2Q',  50.0),
  ('3Q',  75.0),
  ('4Q', 100.0),
))

# Building blocks of the DDD grammar.
# Fragments that contain regex escapes are raw strings, so that \d, \s, \[ and
# \. reach the re module unchanged rather than relying on Python passing
# unrecognised string escapes through (which is deprecated).
# Fragments containing %d are instantiated twice: with 0 for the first date
# element and with 1 for the optional second element of a range.
nodate     =r'(nd)|(n\.\s?d\.)|(no\s?date)'
prenote    =r'(?P<prenote>(%s)|(\[(.*?)\]))?'%nodate
postnote   =r'(\[(?P<postnote>.*?)\])?'
circa      =r'(?P<circa%d>(c\.?)|(circa))'
uncertain  =r'(?P<uncertain%d>\?|p)'
ba         =r'(?P<ba%d>[<>])' # before or after
half       ='[12]h'
third      ='[123]t'
quarter    ='[1234]q'
eml        ='(em)|(ml)|[eml]'
prefix     ='(?P<prefix%s>(%s)|(%s)|(%s)|(%s))'%('%d',half,third,quarter,eml,)
# NOTE(review): the '$' below binds only to the second (two-digit)
# alternative; confirm that this is intended.
simplerange=r'(?P<simplerange>(1\d\d\d[-]\d)|(1\d\d\d[-]\d\d)$)' # post-1000 only
century    =r'(?P<century%d>\d\d?)[Cc]'
decade     =r'(?P<decade%d>\d{2,3}0)s'
year       =r'(?P<year%d>\d{1,4})'
oldstyle   =r'(?P<oldstyle%d>(\d{3}[012345678]/\d)|(\d{3}9/\d{2}))'

# Fragments for the first date element (group names end in 0)...
first_dict={
  'uncertain': uncertain%0,
  'ba':        ba%0,
  'circa':     circa%0,
  'prefix':    prefix%0,
  'year':      year%0,
  'simplerange': simplerange,
  'century':   century%0,
  'decade':    decade%0,
  'oldstyle':  oldstyle%0,
}
# ...and for the second element of a range (group names end in 1).
second_dict={
  'uncertain': uncertain%1,
  'ba':        ba%1,
  'circa':     circa%1,
  'prefix':    prefix%1,
  'year':      year%1,
  'century':   century%1,
  'decade':    decade%1,
  'oldstyle':  oldstyle%1,
}
first ='{uncertain}?{ba}?{circa}?{prefix}?({simplerange}|{oldstyle}|{decade}|{year}|{century})'.format(**first_dict)
second='{uncertain}?{ba}?{circa}?{prefix}?({oldstyle}|{decade}|{year}|{century})'.format(**second_dict)
second='((?P<rangesep>([x-])|([-–]{2}))'+second+')?' # note en dash here
# The complete grammar: optional prenote, first element, optional range
# separator plus second element, optional bracketed postnote.
dd=prenote+first+second+postnote+'$'
# NOTE(review): this prints the whole grammar every time the module is
# imported; it is kept because it is existing module-level behaviour.
print(dd)
re_dd=re.compile(dd,flags=re.IGNORECASE)
re_spaces=re.compile(r'(\s{2,})')
# A circa marker ('c.', with a literal dot) and the text it governs.  The
# lookahead ends the governed text at the end of the string, at a range
# separator (hyphen, en dash or x) or at the start of a bracketed note.  The
# en dash is needed because hyphens in the normalized form have already been
# converted to en dashes before this pattern is applied.
re_circa=re.compile(r'c\.(.*?)(?=$|-|–|x|\[)') # lookahead assertion

def simplerange_to_float(x):
  '''Return the closing year of a compact range such as '1250-5' as a float.

  x is split at its hyphen.  The digits after the hyphen are appended to the
  leading digits of the part before it: to its first three characters when
  there is exactly one digit after the hyphen, otherwise to its first two.
  '''
  start,tail=x.split('-')
  kept=3 if len(tail)==1 else 2
  return float(start[:kept]+tail)

class DDD:
  '''Document date descriptor.

  Parses a compact date specification (a regnal year, or a string accepted
  by the module-level grammar re_dd) and derives several renderings of it.

  Attributes set by the constructor:
    ok         -- True if the input was recognised, otherwise False.
    dd         -- the input exactly as given.
    clean      -- normalized compact form ('' if not recognised).
    wordy      -- verbose English form ('' if not recognised).
    latex      -- LaTeX form ('' if not recognised).
    sortkey    -- float used for ordering; -1.0 means not defined.
    regnalyear -- result of parse_regnal_year_spec for regnal-year input,
                  otherwise (None,None,None).
    oldstyle   -- only set for regnal years whose calendar year contains '/'.
  '''

  # Spellings of "no date" that the grammar accepts as a prenote.
  _re_nodate=re.compile(r'(nd|n\.\s?d\.|no\s?date)$',flags=re.IGNORECASE)
  # Order in which the matched groups are examined.
  _scan_order=('prenote','uncertain0','ba0','circa0','prefix0','year0','simplerange','century0','decade0','oldstyle0','rangesep','uncertain1','ba1','circa1','prefix1','year1','century1','decade1','oldstyle1','postnote',)
  # Order in which the rendered fragments are joined.
  _join_order=('uncertain0','ba0','circa0','prefix0','year0','century0','decade0','oldstyle0','rangesep','uncertain1','ba1','circa1','prefix1','year1','century1','decade1','oldstyle1',)

  def __init__(s,dd,verbose=False):
    '''Parse the date descriptor dd; verbose=True prints parsing details.'''
    s.verbose=verbose
    # Every attribute read by the accessors, __repr__ and __str__ is given a
    # value before any early return, so those methods never fail.
    s.dd=dd # save exact input
    s.clean=''
    s.wordy=''
    s.latex=''
    s.sortkey=-1.0 # indicates not yet defined
    s.regnalyear=None,None,None
    if not dd: # empty input
      s.ok=False
      return
    dds=dd.strip()
    # special case: regnal year (these cannot be uncertain)...
    ry=parse_regnal_year_spec(dds)
    if ry:
      s._init_regnal(dd,ry)
      return
    # general case...
    s._init_general(dds)

  def _init_regnal(s,dd,ry):
    '''Fill in the attributes for a regnal-year specification ry.'''
    if ry=='error':
      s.ok=False
      s.sortkey=-1.0
      # the marker is also used for the LaTeX form so that it shows up there
      s.wordy=s.clean=s.latex='DDDFIXME(%s)'%dd
      return
    if s.verbose: print('ry="%s"'%(ry,))
    s.ok=True
    s.regnalyear=ry
    year,monarch=ry[0],ry[1]+' '+str(ry[2])
    actual_year=get_regnalyear(year,monarch)
    if s.verbose: print('year="%s", monarch="%s", actual_year="%s"'%(year,monarch,actual_year,))
    if s.verbose: print('int_to_roman(ry[2])="%s"'%(int_to_roman(ry[2])))
    monarch_name=(monarch.split())[0]
    s.clean=str(year)+' '+monarch_name+' '+int_to_roman(ry[2])
    if s.verbose: print('s.clean="%s"'%(s.clean,))
    s.wordy=s.clean+' (%s)'%actual_year
    s.latex=s.clean+' (%s)'%actual_year
    s.latex=s.latex.replace('<','$<$').replace('>','$>$')
    if '/' in actual_year:
      # a split year such as 1250/1 sorts under the year after the slash year
      s.oldstyle=actual_year
      x,y=actual_year.split('/')
      s.sortkey=float(x)+1.0
    else:
      s.sortkey=float(actual_year)

  def _init_general(s,dds):
    '''Fill in the attributes for a descriptor matched by re_dd.'''
    m=re_dd.match(dds)
    if not m:
      s.ok=False
      return
    s.ok=True
    groups=m.groupdict()
    nodate_mark=prenote=postnote=''
    c={} # compact ("clean") fragment for each matched group
    w={} # verbose ("wordy") fragment for each matched group
    offsets=[0.0,0.0] # sort key shift from the period prefix of each element
    is_eml=[False,False] # whether each element has an e/m/l type prefix
    # enforce scanning of keys in desired order...
    for key in s._scan_order:
      value=groups.get(key)
      if value is None: continue
      if s.verbose: print('key="%s" value="%s"'%(key,value,))
      if key=='simplerange':
        # a compact range is complete in itself: nothing else is examined
        c['year0']=w['year0']=value
        s.sortkey=simplerange_to_float(value)
        break
      if key=='prenote':
        prenote=value.strip('[]')
        # the grammar is case-insensitive and allows optional spaces, so
        # recognise every spelling it accepts, not only three literal ones
        if s._re_nodate.match(prenote):
          nodate_mark='n.d.'
          prenote=''
        continue
      if key=='postnote':
        postnote=value.strip('[]')
        continue
      if key=='rangesep':
        # 'x' in either case is normalized to 'x'; every dash form ('-',
        # '--' or en dashes) is normalized to one hyphen, which becomes a
        # single en dash in the clean form below
        if value.lower()=='x':
          c[key]='x'
        else:
          c[key]='-'
        w[key]=' to '
        continue
      kind,i=key[:-1],int(key[-1]) # e.g. 'year1' -> ('year',1)
      if kind=='ba':
        c[key]=value
        w[key]=wordy_expander[value]
      elif kind=='circa':
        c[key]='c.'
        w[key]='circa '
      elif kind=='uncertain':
        code=value.lower() # the grammar also accepts an upper-case 'P'
        c[key]=code
        w[key]={'?': 'perhaps ','p': 'probably'}[code]
      elif kind=='prefix':
        code=value.upper()
        c[key]=value.lower()
        if i==0:
          w[key]=wordy_expander[code]
        else:
          w[key]=' the '+wordy_expander[code]
        offsets[i]=eml_shift[code]
        is_eml[i]=c[key] in ('e','m','l','em','ml')
      elif kind=='decade':
        c[key]=w[key]=value+'s'
        s.sortkey=float(value)+10.0
        if is_eml[i]:
          # rescale the shift from a century to a decade and measure it
          # from the start of the decade
          offsets[i]=offsets[i]/10.0-10.0
      elif kind=='century':
        s.sortkey=100.0*float(value)-100.0
        c[key]='%sC'%value
        w[key]=' %s%s century'%(value,s._ordinal_suffix(value))
      elif kind=='year':
        c[key]=w[key]=value
        s.sortkey=float(value)
      elif kind=='oldstyle':
        c[key]=w[key]=value
        s.sortkey=float(value[:value.index('/')])+1.0
    # NOTE(review): the first element's shift is used whenever the second
    # element's shift is zero, even if a second element is present; this is
    # the existing behaviour and has been kept.
    if offsets[1]:
      s.sortkey+=offsets[1]
    else:
      s.sortkey+=offsets[0]
    core=nodate_mark+''.join([c.get(key,'') for key in s._join_order])
    core=core.replace('-',u'–') # en dash
    wordy=' '.join([wordy_expander[nodate_mark],prenote]+[w.get(key,'') for key in s._join_order]+[postnote])
    wordy=re_spaces.sub(' ',wordy)
    s.wordy=wordy.replace(' ,',',').strip()
    if prenote: prenote='['+prenote+']'
    if postnote: postnote='['+postnote+']'
    s.clean=(prenote+core+postnote).strip().replace(' ,',',')
    # The LaTeX form is assembled from the parsed parts, so that \circa and
    # \times are applied to the date itself and never to the text of a note.
    separator=c.get('rangesep','')
    if separator=='x': separator=r'$\times$'
    latex_core=nodate_mark+s._latex_element(c,0)+separator+s._latex_element(c,1)
    latex_core=latex_core.replace('-',u'–')
    latex=(prenote+latex_core+postnote).strip().replace(' ,',',')
    if '--' not in latex: latex=latex.replace('-','--')
    latex=latex.replace(u'–','--')
    s.latex=latex.replace('<','$<$').replace('>','$>$')

  @staticmethod
  def _latex_element(c,i):
    '''Return the LaTeX text of date element i (0 or 1) from clean fragments c.'''
    tag=str(i)
    body=''.join([c.get(kind+tag,'') for kind in ('prefix','year','century','decade','oldstyle')])
    if ('circa'+tag) in c:
      # \circa takes the prefix and date it governs as its argument
      body=r'\circa{%s}'%body
    return c.get('uncertain'+tag,'')+c.get('ba'+tag,'')+body

  @staticmethod
  def _ordinal_suffix(digits):
    '''Return the English ordinal suffix ('st','nd','rd' or 'th') for digits.'''
    n=int(digits)
    if n%100 in (11,12,13): return 'th'
    return {1:'st',2:'nd',3:'rd'}.get(n%10,'th')

  def get_ok(s):
    '''Return True if the input was recognised.'''
    return s.ok
  def get_sortkey(s):
    '''Return the float sort key (-1.0 if not defined).'''
    return s.sortkey
  def get_clean(s):
    '''Return the normalized compact form.'''
    return s.clean
  def get_wordy(s):
    '''Return the verbose English form.'''
    return s.wordy
  def get_latex(s):
    '''Return the LaTeX form.'''
    return s.latex
  def __repr__(s):
    return "DDD('%s')"%(s.dd,)
  def __str__(s):
    return "DD('%s')"%(s.dd,)
  def __lt__(s,t):
    # descriptors are ordered by their sort keys
    return s.sortkey<t.sortkey

def test_01(fn,grammar_html):
  '''Run every date descriptor listed in the file fn through DDD.

  A table of results is printed to standard output, and an HTML report is
  written to date_descriptor_examples.html in the current directory.

  fn           -- name of a text file with one date descriptor per line;
                  lines starting with '#' are copied to the report as they
                  are, and empty lines are skipped.
  grammar_html -- the grammar, already HTML-escaped, for the report header.

  The program is terminated (via exit) as soon as a recognised descriptor
  has a sort key below 500.
  '''
  def escape(text):
    # escaping applied to every result line written inside <pre>
    return text.replace('<','&lt;').replace('>','&gt;').replace(u'–','&ndash;')
  # the with-statements make sure both files are closed even when the loop
  # is left through exit() or an exception
  with open('date_descriptor_examples.html','w') as html:
    print('%-20s\t%4s\t%18s\t%-22s\t%s'%('input','sort','normalized output','LaTeX output','verbose output',))
    html.write(r'<h4>The full DDD grammar as a python regular expression</h4>'+'\n')
    html.write(r'<p><tt>%s</tt></p>'%grammar_html+'\n')
    html.write(r'<h4>The test cases</h4>'+'\n')
    html.write(r'<p>Below is the output from running the python code on my standard set of test cases <a href="DDD_test_cases.txt">DDD_test_cases.txt</a>.  Here the first column is input, and the next three columns are output from the python code.  The normalized output is intended to appear in publications derived from the input; this then ensures consistency of layout and formatting.  Note that an ordinary hyphen (-) for a range is converted to an en-dash (&ndash;) in the normalized output.</p>'+'\n')
    html.write('<pre>\n%-26s\t%4s\t%-22s\t%s\n'%('input','sort','normalized output','verbose output',))
    print('-'*120)
    with open(fn,'r') as f:
      for line in f:
        # remove only the line terminator, so that a final line without one
        # does not lose its last character
        text=line.rstrip('\n')
        if not text: continue # nothing to parse on an empty line
        if text[0]=='#':
          html.write('\n%s\n'%text)
          continue
        dd=DDD(text)
        if not dd.get_ok():
          print('"%s" failed'%dd)
          html.write(escape('"%s" failed'%dd)+'\n')
          continue
        output_line='%-20s\t%4.0f\t%18s\t%-22s\t%s'%(text,dd.get_sortkey(),dd.get_clean(),dd.get_latex(),dd.get_wordy(),)
        print(output_line)
        html_line='%-26s\t%4.0f\t%-22s\t%s'%(text,dd.get_sortkey(),dd.get_clean(),dd.get_wordy(),)
        html.write(escape(html_line)+'\n')
        # stop at the first recognised descriptor with a sort key below 500
        if dd.get_sortkey()<500: exit()
    html.write('</pre>')

if __name__=='__main__':
  # Script entry point: HTML-escape the grammar string (backslash, angle
  # brackets, en dash) and run the standard test cases with it.
  grammar_html=dd
  for plain,entity in (('\\','&#92;'),('<','&lt;'),('>','&gt;'),(u'–','&ndash;')):
    grammar_html=grammar_html.replace(plain,entity)
  test_01('DDD_test_cases.txt',grammar_html)