Benutzer:Griot/analyse-datum.py
Zur Navigation springen
Zur Suche springen
#! /usr/bin/env python
# -*- coding: iso-8859-15 -*-
<?python
'''
# '''
# Das folgende Programm 'analyse_datum.py' ist noch ganz unbefriedigend, wohl aber bereits arbeitsfähig.
# Wird es in einem linux-System, in dem Python (Version <3.0) installiert ist,
# in einer der folgenden Arten aufgerufen:
# python analyse_datum.py infile
# python analyse_datum.py infile outfile
# python analyse_datum.py infile >outfile
# wo 'infile' ein File ist, welches u.a. die zu untersuchenden Zeilen GEBURTSDATUM/STERBEDATUM enthält,
# gibt es inkorrekte derartige Zeilen aus.
#
# ('infile' kann ein ganzes Dumpfile sein, das Programm ist dann allerdings nicht sehr schnell.
# [Wenn infile nur die Personenartikel enthält, benötigt das Programm
# auf einem 5 Jahre alten Durchschnitts-PC etwa 9 Minuten.]
# Zudem werden auch Zeilen, die formal wie Zeilen von Personendaten aussehen, behandelt und eventuell bemängelt.)
# Als Optionsparameter kann -0, -1, ..., -10 angegeben werden, also etwa
# python analyse_datum.py -5 infile
# Dadurch wird die Interpretation von 'um' gesteuert.
# Je kleiner der Optionswert (0, ..., 10), umso kleiner das sich ergebende Intervall.
# Das angegebene '-5' etwa bedeutet:
# 'um 12. Jahrhundert' wird interpretiert als 'zwischen 1071 und 1230' (+/- 30 Jahre)
# 'um 1250' wird interpretiert als 'zwischen 1247 und 1253' (+/- 3 Jahre)
# 'um Mai 1250' wird interpretiert als 'zwischen April 1250 und Juni 1250' (+/- 1 Monat)
# 'um 15. Mai 1250' wird interpretiert als 'zwischen 5. Mai 1250 und 25. Mai 1250' (+/- 10 Tage)
# Der default-Wert entspricht '-7'. Das heißt
# +/- 1 Jahrhundert, +/- 10 Jahre, +/- 3 Monate, +/- 1 Monat, je nach der Genauigkeit der Zeitangabe.
# (Für andere Werte siehe 'um_widening' im Programmtext.)
# Das Programm enthält einen schweren Entwurfsfehler, der vor einer Funktionserweiterung
# (etwa zur Bestimmung des gesamten Zeitintervalls, zwecks Abgleich mit den Kategorien)
# beseitigt werden sollte: die frühzeitige Interpretation von 'um', 'nach', 'vor',
# sofort bei Erkennung dieser Formen.
# Die Interpretation sollte verschoben werden, bis die gesamten Angaben bearbeitet wurden.
# Dazu ist allerdings noch eine Erweiterung der benutzten Datenstrukturen erforderlich.
################################################################
# version 01: quick-and-dirty changes, to handle the changed formats from february 2018.
# outdated lines (mostly) aren't deleted, but commented out by '#-#'
# done:
# 1) lowest date changed (for "Leanderthal lady")
# 2) no linking of centuries anymore
# 3) 'um'-clauses in 'vor'-, 'nach'-, 'zwischen'-clauses
# 4) only "–" in a range, not "/"
# 5) no n.–m. Jahrhundert anymore
################################################################
#
# Es folgt das eigentliche Programm (jedoch ist auch dieses gesamte File arbeitsfähig):
# -------------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: iso-8859-15 -*-
import sys, re, time
################################################################
################################################################
# class DRS (common of Date, Range, Sieve)
#
# The class represents properties, common to
# Date, Range, and Sieve.
########################################################
class DRS:
    """Behaviour shared by Date, Range and Sieve.

    The methods here rely on lo() and hi() being available on the instance;
    this class itself does not define them.
    """

    def lo_extrem(self):
        """Return lo_extrem() of the object's lower bound lo()."""
        lower = self.lo()
        return lower.lo_extrem()

    def hi_extrem(self):
        """Return hi_extrem() of the object's upper bound hi()."""
        upper = self.hi()
        return upper.hi_extrem()

    def is_empty(self):
        """True when the lowest extreme compares greater than the highest one."""
        first = self.lo_extrem()
        last = self.hi_extrem()
        return first > last

    def overlap(self, u):
        """Return the Range from the later lower extreme to the earlier upper extreme.

        u must be a DRS instance.  The resulting Range is empty
        (see is_empty) when the two objects do not intersect.
        """
        assert isinstance(u, DRS)
        start = max(self.lo_extrem(), u.lo_extrem())
        stop = min(self.hi_extrem(), u.hi_extrem())
        return Range((start, stop))
################################################################
################################################################
# class Date
#
# The class represents dates, containing century, year, month and day.
# Some of these parts can be missing from a date.
# (For the representation of the parts, see below.)
# Instances are immutable.
#
# 'year' means 'year in its century' here, a 'full' year is named 'fyear'.
########################################################
class Date( DRS ):
    """A possibly partial date made of (century, year, month, day).

    Every component is either a number or None.  'year' is the year inside
    its century; a full year is called 'fyear'.  Instances are treated as
    immutable; only the defect list is computed lazily and cached.
    """
    CYMD = 4 # number of 'components' of a date

    def __init__(self, u=(), **keys):
        """Create a date from a tuple prefix or from another Date.

        u    -- a tuple (c, y, m, d) or a shorter prefix of it (missing
                components become None), or a Date whose components and
                circa flag are copied.
        keys -- optional overrides 'c', 'y', 'm', 'd', 'circa', and 'fy'
                (a full year, split into century and year-in-century;
                applied after 'c' and 'y', so it wins over them).
        """
        if isinstance(u, tuple):
            assert len(u) <= CYMD
            # pad the given prefix with None up to CYMD components
            c, y, m, d = (u + CYMD * (None,))[:CYMD]
            circa = False
        elif isinstance(u, Date):
            c, y, m, d = u.elems()
            circa = u.circa()
        else:
            assert False
        if "c" in keys: c = keys["c"]
        if "y" in keys: y = keys["y"]
        if "m" in keys: m = keys["m"]
        if "d" in keys: d = keys["d"]
        if "fy" in keys: # fyear: full year (i.e. year incl. century)
            # fyear --> c, y
            # -199 .. -100 --> -1, 0 .. -1, 99
            #  -99 ..    0 -->  0, 0 ..  0, 99
            #    1 ..  100 -->  1, 0 ..  1, 99
            fyear = keys["fy"]
            c = (fyear - 1) // 100 + 1
            y = (fyear - 1) % 100
        if "circa" in keys: circa = keys["circa"]
        # a year requires a century
        assert (y is None) or (c is not None)
        self.elems_ = (c, y, m, d)
        self.circa_ = circa
        # note defects (False means: not computed yet)
        self.defects_ = False
        return

    # ======
    def elems(self):
        """Return the component tuple (c, y, m, d)."""
        return self.elems_

    def circa(self):
        """Return the circa flag."""
        return self.circa_

    ########
    def defects(self):
        """Return the list of defect messages, computing it on first use."""
        if self.defects_ is False:
            self.defects_ = []
            c, y, m, d = self.elems()
            assert (y is None) or (year_lo <= y <= year_hi)
            assert (m is None) or (month_lo <= m <= month_hi)
            if d is not None:
                mm = "day number " + extern_day_string(d)
                m1 = " invalid"
                m2 = " invalid for this month"
                m3 = " invalid for this month and year"
                # test against the upper limit with increasing context:
                # no month, month only, month with century and year
                if d < day_lo:
                    assert False
                elif d > day_hi(Date()):
                    self.defects_ += [mm + m1]
                elif d > day_hi(Date(m=m)):
                    self.defects_ += [mm + m2]
                elif d > day_hi(Date(c=c, y=y, m=m)):
                    self.defects_ += [mm + m3]
            fy = self.fyear()
            if (c is not None and y is not None) and (fy < fyear_input_lo):
                mm = ("year " + extern_fyear_string(fy)
                      + " is before the first handled year")
                self.defects_ += [mm]
            if (c is not None and y is None) and (c < cent_input_lo):
                mm = ("century " + extern_cent_string(c)
                      + " is before the first handled century")
                self.defects_ += [mm]
            if (c is not None) and (ultima_input.hi() < self.lo_extrem()):
                mm = "a given date is a future date"
                self.defects_ += [mm]
            if (d is not None) and in_gap(self):
                mm = "dates 5. to 14. October 1582 aren't existing"
                self.defects_ += [mm]
        return self.defects_

    ########
    def c(self): return self.elems()[0]
    def y(self): return self.elems()[1]
    def m(self): return self.elems()[2]
    def d(self): return self.elems()[3]

    # ======
    def fyear(self):
        """Return the full year built from century and year-in-century.

        Missing parts count as 0; the value is meaningless when the
        century is None.
        """
        c, y, m, d = self.elems()
        c = (c if c is not None else 0) # value unused for c == None
        y = (y if y is not None else 0)
        return 100 * (c - 1) + (y + 1)

    ########
    def lo(self): return self
    def hi(self): return self

    # ======
    def lo_extrem(self):
        """Return the date with every missing component set to its lowest value."""
        assert self.is_monotonic()
        c, y, m, d = self.elems()
        c = c if (c is not None) else cent_lo
        y = y if (y is not None) else year_lo
        m = m if (m is not None) else month_lo
        d = d if (d is not None) else day_lo
        return Date((c, y, m, d))

    # ======
    def hi_extrem(self):
        """Return the date with every missing component set to its highest value.

        The highest day is asked from day_hi() for this date without its day.
        """
        assert self.is_monotonic()
        c, y, m, d = self.elems()
        c = c if (c is not None) else cent_hi
        y = y if (y is not None) else year_hi
        m = m if (m is not None) else month_hi
        d = d if (d is not None) else day_hi(Date(self, d=None))
        return Date((c, y, m, d))

    ########
    def __eq__(self, date):
        """Equal when the other operand is a Date with the same components."""
        r = (self.elems() == date.elems()) if isinstance(date, Date) else False
        return r

    # ======
    def __ne__(self, date):
        """Exact negation of __eq__; a non-Date operand is simply unequal."""
        return not self.__eq__(date)

    # ======
    def __lt__(self, date):
        assert isinstance(date, Date)
        assert self.precision() == date.precision()
        return self.elems() < date.elems()

    # ======
    def __gt__(self, date):
        assert isinstance(date, Date)
        assert self.precision() == date.precision()
        return self.elems() > date.elems()

    # ======
    def __le__(self, date):
        """Compare on the common precision.

        With equal common prefixes the result is true only if this date is
        at least as precise as the other one and both have the same
        lowest extreme.
        """
        assert isinstance(date, Date)
        prec = min(self.precision(), date.precision())
        return (self.elems()[:prec] < date.elems()[:prec]
                or (self.elems()[:prec] == date.elems()[:prec]
                    and self.precision() >= date.precision()
                    and self.lo_extrem() == date.lo_extrem()))

    # ======
    def __ge__(self, date):
        """Counterpart of __le__, using the highest extreme for the tie case."""
        assert isinstance(date, Date)
        prec = min(self.precision(), date.precision())
        return (self.elems()[:prec] > date.elems()[:prec]
                or (self.elems()[:prec] == date.elems()[:prec]
                    and self.precision() >= date.precision()
                    and self.hi_extrem() == date.hi_extrem()))

    ########
    def is_valid(self):
        """True when defects() reports nothing."""
        return self.defects() == []

    # ======
    def precision(self):
        "Counts non-None high-end parts of the date."
        n = 0
        while (n < CYMD and self.elems()[n] is not None):
            n += 1
        return n

    # ======
    def is_monotonic(self):
        "The non_None high-end parts are followed by None-parts only?"
        prec = self.precision()
        return self == Date(self.elems()[:prec])

    # ======
    def is_date_to_the_day(self):
        """True when all four components are present."""
        return self.precision() == CYMD

    ########
    def month_is_normalized(self):
        """True when year and month (if present) lie inside their ranges."""
        c, y, m, d = self.elems()
        return (
            True # The century value may be unrestricted here.
            and ((y is None) or (year_lo <= y <= year_hi))
            and ((m is None) or (month_lo <= m <= month_hi))
        )

    # ======
    def normalize_month(self):
        """Carry a month overflow/underflow into the year, then normalize the year."""
        assert self.is_monotonic()
        w = list(self.elems())
        if w[2] is not None:
            r, s = divmod(w[2], months_in_year)
            w[2] = s
            w[1] += r
        w = tuple(w)
        return Date(w).normalize_year()

    # ======
    def normalize_year(self):
        """Carry a year overflow/underflow into the century."""
        assert self.is_monotonic()
        w = list(self.elems())
        if w[1] is not None:
            r, s = divmod(w[1], years_in_century)
            w[1] = s
            w[0] += r
        w = tuple(w)
        return Date(w).normalize_cent()

    # ======
    def normalize_cent(self):
        # There's no need to restrict the value on the range ultima.
        return self

    ################################################
    def __add__(self, u):
        "Add duration."
        ########################
        def day_number_from_index(u):
            # re-insert the gap days of the reform month, if u lies after the gap
            d = u.d()
            if d is not None:
                if pre_gap < u:
                    d += length_of_gap_in_month(u)
            return d
        ########
        def day_index_from_number(u):
            # take out the gap days of the reform month, if u lies after the gap
            d = u.d()
            if d is not None:
                assert not in_gap(u)
                if post_gap <= u:
                    d -= length_of_gap_in_month(u)
            return d
        ########################
        v = u.elems()
        assert self.is_valid()
        assert self.is_monotonic()
        assert all([(el is not None) for el in v])
        c, y, m, d = self.elems()
        # If the duration touches a component this date does not have,
        # start from the matching extreme: the lowest one when
        # subtracting, the highest one when adding.
        if not all([(self_el is not None) or (v_el == 0)
                    for self_el, v_el in zip(self.elems(), v)]):
            add = all([(el >= 0) for el in v])
            sub = all([(el <= 0) for el in v])
            assert add or sub
            if sub:
                c, y, m, d = self.lo_extrem().elems()
            else:
                c, y, m, d = self.hi_extrem().elems()
        # To add first the centuries-years-months, then the days,
        # gives sometimes different results than
        # to add first the days, then the centuries-years-months.
        # (Example: 4. 10. 1582 + 1 day + 1 month = 15. 11. 1582 [32 days later],
        #           4. 10. 1582 + 1 month + 1 day = 5. 11. 1582 [22 days later])
        # This big difference results from the anomaly of October 1582,
        # smaller differences (maximal 3[?]) from the differences in month lengths.
        #
        # To add days first seems to give results, which are more 'naturally'.
        # add days:
        if d is not None:
            d = day_index_from_number(Date((c, y, m, d)))
            d += v[-1]
            while True:
                if d < 0:
                    # borrow the days of the preceding month
                    m -= 1
                    dat = Date((c, y, m, d)).normalize_month()
                    n = days_in_month(dat)
                    c, y, m, d = dat.elems()
                    d += n
                else:
                    n = days_in_month(Date((c, y, m, d)))
                    if n <= d:
                        # carry into the following month
                        m += 1
                        d -= n
                        dat = Date((c, y, m, d)).normalize_month()
                        c, y, m, d = dat.elems()
                    else: # 0 <= d < n
                        break
            d = day_number_from_index(Date((c, y, m, d)))
        # add centuries-years-months:
        w = [c, y, m, d]
        for i in (0, 1, 2):
            if w[i] is None: break
            w[i] += v[i]
        w = tuple(w)
        dat = Date(w).normalize_month()
        # this addition changed the month, so it may have invalidated the day
        # case 1: day number above maximum for that month
        # (cannot occur for the reform month with max. day number 31)
        n = days_in_month(dat)
        c, y, m, d = dat.elems()
        if n <= d:
            m += 1
            d -= n # 0 <= d < 3, so no 2nd correction needed
            dat = Date((c, y, m, d)).normalize_month()
        # case 2: 'date' falls in the gap of the reform month
        if in_gap(dat):
            add = all([(el >= 0) for el in v])
            sub = all([(el <= 0) for el in v])
            assert add or sub
            if sub:
                dat = pre_gap
            else:
                dat = post_gap
        result = dat
        return result

    ################################################
    def __sub__(self, u):
        "Subtract duration."
        return self + (-u)

    ################################################
    def restrict_on_ultima(self):
        "Restrict given date value on the range ultima."
        w = self.elems()
        prec = self.precision()
        if self < ultima.lo():
            w = ultima.lo().elems()[:prec]
        elif self > ultima.hi():
            w = ultima.hi().elems()[:prec]
        return Date(w)
################
# specials
########
# Module-level alias for the class constant; code in this file
# (e.g. Duration.__init__) refers to the bare name CYMD.
# It has to be bound before the first Date is constructed below.
CYMD = Date. CYMD
#
# The date without any component (all four parts are None).
date_none = Date()
################################################################
# class Range
#
# The class represents an interval of dates.
# Instances are immutable.
########################################################
class Range( DRS ):
    """An interval of dates, stored as the pair (lo, hi)."""

    def __init__(self, u):
        """Build a range from a 2-tuple (lo, hi), from a Date, or from a Range.

        A single Date is used for both bounds; a Range is copied.
        Anything else fails an assertion.
        """
        if isinstance(u, tuple) and len(u) == 2:
            lower, upper = u
        elif isinstance(u, Date):
            lower = upper = u
        elif isinstance(u, Range):
            lower, upper = u.lo(), u.hi()
        else:
            assert False
        # The bounds are deliberately not type-checked here: for clauses
        # with 'um' inside a 'zwischen' clause they are not Dates.
        # Should get a real handling, but for now:
        # assert isinstance( lo, Date ) and lo. is_monotonic()
        # assert isinstance( hi, Date ) and hi. is_monotonic()
        self.elems_ = (lower, upper)
        # defect list, False while not computed
        self.defects_ = False
        return

    # ======
    def elems(self):
        """Return the pair (lo, hi)."""
        return self.elems_

    ########
    def defects(self):
        """Return the cached defect list; an empty range is the only defect."""
        if self.defects_ == False:
            found = []
            self.defects_ = found
            if self.is_empty():
                found.append("range is empty")
        return self.defects_

    ################
    def lo(self):
        """Lower bound as given at construction."""
        return self.elems()[0]

    def hi(self):
        """Upper bound as given at construction."""
        return self.elems()[1]

    # ======
    def wide(self, u):
        """Return a new Range widened by durations.

        u is either one Duration (applied to both ends) or a 2-tuple
        (w_lo, w_hi); w_lo is subtracted from lo(), w_hi is added to hi().
        """
        is_pair = isinstance(u, tuple) and len(u) == 2
        if is_pair:
            w_lo, w_hi = u
        else:
            w_lo, w_hi = u, u
        assert isinstance(w_lo, Duration)
        assert isinstance(w_hi, Duration)
        widened_lo = self.lo() - w_lo
        widened_hi = self.hi() + w_hi
        return Range((widened_lo, widened_hi))
################################################################
# class Sieve
#
# The class represents the 'same' day or month in a range of years.
# Instances are immutable.
########################################################
class Sieve( DRS ):
    """The 'same' day or month taken in every year of a range."""

    def __init__(self, u, v):
        """Collect the valid dates matching v inside the range u.

        u -- a Range.
        v -- a Date without century and year but with a month (day optional).
        """
        assert isinstance(u, Range)
        assert isinstance(v, Date)
        assert (v.c() == v.y() == None) and (v.m() != None)
        self.elems_ = (u, v)
        # list of dates:
        first, last = u.lo_extrem(), u.hi_extrem()
        if v.d() == None:
            # month-only pattern: compare on month precision
            first = Date(first, d=None)
            last = Date(last, d=None)
        self.dates_ = []
        full_year = first.fyear()
        while True:
            candidate = Date(fy=full_year, m=v.m(), d=v.d())
            full_year += 1
            if first <= candidate:
                if not (candidate <= last):
                    break
                if candidate.is_valid():
                    self.dates_.append(candidate)
        # defect list, False while not computed
        self.defects_ = False
        return

    # ======
    def elems(self):
        """Return the pair (range, day/month pattern)."""
        return self.elems_

    def dates(self):
        """Return the list of dates collected at construction."""
        return self.dates_

    ########
    def rng(self):
        return self.elems()[0]

    def dat(self):
        return self.elems()[1]

    ########
    def defects(self):
        """Return the cached defect list.

        Reported are an empty sieve and, when the century widening of 'um'
        is at least one year, a sieve yielding exactly one date.
        """
        if self.defects_ == False:
            self.defects_ = []
            if self.is_empty():
                self.defects_ += ["sieve-expression describes empty set"]
            count = len(self.dates())
            suspicious = (1 <= count <= 1)   # 2->1
            if suspicious and um_widening()[0] >= year * 1:
                plural = (count > 1) * "s"
                text = ("sieve-expression generates "
                        + str(count) + " value" + plural + " only")
                self.defects_ += [text]
        return self.defects_

    ################
    def lo(self):
        """First collected date; ultima.hi() when nothing was collected."""
        if self.dates() == []:
            return ultima.hi()
        return self.dates()[0]

    # ======
    def hi(self):
        """Last collected date; ultima.lo() when nothing was collected."""
        if self.dates() == []:
            return ultima.lo()
        return self.dates()[-1]
################################################################
# class Or_row
#
# The class represents a row of dates, ranges, and sieves, combined by 'oder'.
# Instances are immutable.
########################################################
class Or_row:
    """A row of dates, ranges and sieves combined by 'oder'."""

    def __init__(self, u):
        """Store the elements of the list u as a tuple; non-lists fail an assertion."""
        if isinstance(u, list):
            self.elems_ = tuple(u)
        else:
            assert False
        # defect list, False while not computed
        self.defects_ = False
        return

    # ======
    def elems(self):
        """Return the tuple of row elements."""
        return self.elems_

    ########
    def defects(self):
        """Return the cached defect list (ordering and overlap problems)."""
        if self.defects_ is False:
            self.defects_ = []
            if not self.is_ordered():
                mm = "oder list isn't ordered"
                self.defects_ += [mm]
            if not self.is_non_overlapping():
                mm = "oder list is overlapping"
                self.defects_ += [mm]
        return self.defects_

    ################
    def is_valid(self):
        """True when the row is ordered and free of overlaps."""
        # Valid, if the order is the time order (of the earliest possible value)
        # and if there is no overlapping.
        order = self.is_ordered()
        non_ov = self.is_non_overlapping()
        # the original returned the undefined name 'non_olap' here
        return order and non_ov

    # ======
    def is_ordered(self):
        """True when the lowest extremes are strictly increasing along the row."""
        items = self.elems()
        return all([items[j-1].lo_extrem() < items[j].lo_extrem()
                    for j in range(1, len(items))])

    # ======
    def is_non_overlapping(self):
        """True when no two elements of the row overlap.

        Two sieves whose spans overlap still count as disjoint when their
        patterns differ in month, or differ in day with at least one day given.
        """
        items = self.elems()
        non_ov = True
        for i in range(len(items)):
            for j in range(i + 1, len(items)):
                eli = items[i]
                elj = items[j]
                nov = eli.overlap(elj).is_empty()
                if not nov:
                    if (isinstance(eli, Sieve)
                            and isinstance(elj, Sieve)):
                        nov = (eli.dat().m() != elj.dat().m()
                               or (eli.dat().d() != elj.dat().d()
                                   and (eli.dat().d() is not None
                                        or elj.dat().d() is not None)))
                non_ov = non_ov and nov
        return non_ov
################################################################
# class Duration
#
# The class represents a duration, counted in days, months, years, centuries.
# Instances are immutable.
########################################################
class Duration:
    """A duration given as the tuple (centuries, years, months, days)."""

    def __init__(self, u):
        """Store the tuple u; it must have exactly CYMD entries."""
        well_formed = isinstance(u, tuple) and len(u) == CYMD
        if not well_formed:
            assert False
        self.elems_ = u
        return

    def elems(self):
        """Return the component tuple."""
        return self.elems_

    ########
    def c(self):
        return self.elems()[0]

    def y(self):
        return self.elems()[1]

    def m(self):
        return self.elems()[2]

    def d(self):
        return self.elems()[3]

    ########
    # Comparisons are plain tuple comparisons of the components;
    # the other operand must be a Duration.
    def __eq__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine == theirs

    # ======
    def __ne__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine != theirs

    # ======
    def __lt__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine < theirs

    # ======
    def __gt__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine > theirs

    # ======
    def __le__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine <= theirs

    # ======
    def __ge__(self, dura):
        assert isinstance(dura, Duration)
        mine, theirs = self.elems(), dura.elems()
        return mine >= theirs

    ########
    def __sub__(self, dura):
        """Component-wise difference of two durations."""
        assert isinstance(dura, Duration)
        pairs = zip(self.elems(), dura.elems())
        return Duration(tuple(a - b for a, b in pairs))

    def __mul__(self, factor):
        """Multiply every component by the int factor."""
        assert isinstance(factor, int)
        return Duration(tuple(part * factor for part in self.elems()))

    def __neg__(self):
        """Negate every component."""
        return self * (-1)
################
# specials
########
# Unit durations: exactly one century, year, month, day.
century, year, month, day = (
    Duration((1, 0, 0, 0)),
    Duration((0, 1, 0, 0)),
    Duration((0, 0, 1, 0)),
    Duration((0, 0, 0, 1)),
)
################################################################
def is_atom( x ):
    """True unless x is a Range, a Sieve or an Or_row."""
    composite = isinstance(x, (Range, Sieve, Or_row))
    return not composite
################
def elems( x ):
    """Return x.elems() for composite values and () for atoms (see is_atom)."""
    if is_atom(x):
        return ()
    return x.elems()
################
def list_elems_depth_first( x ):
    """List x and everything below it, children before their parent."""
    collected = []
    for child in elems(x):
        collected.extend(list_elems_depth_first(child))
    collected.append(x)
    return collected
################################################################
################################################################
#: date_of_birth = [ state_option getauft_option oder_row ]
#: date_of_death = [ state_option begraben_option oder_row ]
################################
# event class
########
# Event classes, passed as 'event_class' to the date parsers.
Birth, Death = 0, 1
################################
def date_of_birth():
    """Parse a date description for the event class Birth."""
    return date_description(Birth)


def date_of_death():
    """Parse a date description for the event class Death."""
    return date_description(Death)
################
def date_description( event_class ):
    """Parse pd_date_option(event_class) followed by nothing_anymore.

    Returns the first sub-result of seq; when seq's result compares
    equal to False, that result is returned unchanged.
    """
    def option():
        return pd_date_option(event_class)

    parsed = seq([option, nothing_anymore])
    if parsed != False:
        return parsed[0]
    return parsed
################################################################
#: pd_date_option = [ pd_date ]
################################
# result parts
########
# Keys of the dictionary built by pd_date and empty_pd_date.
state, event, dates = "state", "event", "dates"
################################
def pd_date_option( event_class ):
    """Alternatives handed to oneof: pd_date first, then empty_pd_date."""
    def filled():                       # 1
        return pd_date(event_class)

    def missing():                      # 2
        return empty_pd_date(event_class)

    return oneof([filled, missing])
################
def pd_date( event_class ):
    """Parse: state_option, event_option(event_class), oder_row.

    On success the three sub-results are returned in a dictionary under
    the keys state, event and dates; otherwise seq's result is returned.
    """
    def kind_of_event():
        return event_option(event_class)

    parsed = seq([state_option, kind_of_event, oder_row])
    if parsed != False:
        return {state: parsed[0],
                event: parsed[1],
                dates: parsed[2]}
    return parsed
################
def empty_pd_date( event_class ):
    """Result for a missing date: the defaults of all three parts.

    Runs seq on an empty list and, unless that compares equal to False,
    returns the dictionary of empty_state_phrase(),
    empty_event_phrase(event_class) and empty_oder_row().
    """
    parsed = seq([])
    if parsed != False:
        return {state: empty_state_phrase(),
                event: empty_event_phrase(event_class),
                dates: empty_oder_row()}
    return parsed
################################################################
#: state_option = [ unsicher colon b ]
################################
# state
########
# Values of the 'state' part of a parsed date.
certain, uncertain = "certain", "uncertain"
################################
def state_option():
    """Alternatives handed to oneof: state_phrase, then empty_state_phrase."""
    alternatives = [
        state_phrase,           # 1
        empty_state_phrase,     # 2
    ]
    return oneof(alternatives)
################
def state_phrase ():
    """Parse 'unsicher colon b'; yields the value uncertain on success."""
    parsed = seq([unsicher, colon, b])
    if parsed != False:
        return uncertain
    return parsed
################
def empty_state_phrase():
    """Parse nothing (seq on an empty list); yields the value certain."""
    parsed = seq([])
    if parsed != False:
        return certain
    return parsed
################################################################
#: getauft_option = [ getauft b ]
#: begraben_option = [ begraben b ]
################################
# event
########
# event class 'Birth':
# Values of the 'event' part, two per event class:
# first pair for the class 'Birth', second pair for the class 'Death'.
birth, baptism = "birth", "baptism"
death, burial = "death", "burial"
################################
def event_option( event_class ):
    """Alternatives handed to oneof: event_phrase, then empty_event_phrase."""
    def named():                        # 1
        return event_phrase(event_class)

    def default():                      # 2
        return empty_event_phrase(event_class)

    return oneof([named, default])
################
def event_phrase ( event_class ):
    """Parse 'getauft b' (class Birth) or 'begraben b' (any other class).

    Yields baptism for the class Birth and burial otherwise.
    """
    for_birth = (event_class == Birth)
    keyword = getauft if for_birth else begraben
    parsed = seq([keyword, b])
    if parsed != False:
        return baptism if for_birth else burial
    return parsed
################
def empty_event_phrase( event_class ):
    """Parse nothing; yields birth for the class Birth and death otherwise."""
    parsed = seq([])
    if parsed != False:
        if event_class == Birth:
            return birth
        return death
    return parsed
################################################################
#: oder_row = date [ oder_phrase ]...
################################
def oder_row():
    """Parse one date followed by repeated oder_phrase parts into an Or_row.

    The second sub-result of seq (from repeat) is concatenated as a list
    behind the first date.
    """
    def further_dates():
        return repeat(oder_phrase)

    parsed = seq([date, further_dates])
    if parsed != False:
        leading, following = parsed[0], parsed[1]
        return Or_row([leading] + following)
    return parsed
################
def empty_oder_row():
    """Parse nothing; yields an Or_row holding only ultima_input."""
    parsed = seq([])
    if parsed != False:
        return Or_row([ultima_input])
    return parsed
################################################################
#: oder_phrase = b oder b date
################################
def oder_phrase():
    """Parse 'b oder b date'; yields the sub-result of date."""
    parsed = seq([b, oder, b, date])
    if parsed != False:
        return parsed[3]
    return parsed
################################################################
#: date = [date_head b] date_tail
################
# i.e.:
# [ [day] month ] year
# [ [day] month ] century
# [ [day] month ] yearpair
# [ [day] month ] vor ...
# [ [day] month ] nach ...
# [ [day] month ] um ...
# [ [day] month ] zwischen ... und ...
# ("day month" may be linked)
# returns one of:
# Date
# Range
# Sieve
################################
def date():
    """Parse an optional 'day month' head followed by a date tail.

    Result by case (head is always a Date, possibly date_none):
      no head, tail Date or Range          -> the tail itself
      head, tail Date with a year          -> Date(head, fy=tail.fyear())
      head, tail Date without a year       -> Sieve(Range(tail), head)
      head, tail Range                     -> Sieve(tail, head)
    Any other tail fails an assertion.
    """
    parsed = seq([date_head_option, date_tail])
    if parsed != False:
        head, tail = parsed[0], parsed[1]
        assert isinstance(head, Date)
        headless = (head == date_none)
        if isinstance(tail, Date):
            if headless:
                # year / century -> Date
                outcome = tail
            elif tail.y() != None:
                # [day] month year -> Date
                outcome = Date(head, fy=tail.fyear())
            else:
                # [day] month century -> Sieve
                outcome = Sieve(Range(tail), head)
        elif isinstance(tail, Range):
            if headless:
                # vor/nach/um/zwischen/yearpair -> Range
                outcome = tail
            else:
                # [day] month vor/nach/um/zwischen/yearpair -> Sieve
                outcome = Sieve(tail, head)
        else:
            assert False
        return outcome
    return parsed
################################################################
#: date_head_option = [ date_head ]
#: date_head = dayM b
################
# i.e.:
# day month
# month
# -
# ("day month" may be linked)
# returns:
# Date (with century == year == None)
################################
def date_head_option():
    """Alternatives handed to oneof: date_head, then empty_date_head."""
    alternatives = [
        date_head,          # 1
        empty_date_head,    # 2
    ]
    return oneof(alternatives)
################
def date_head():
    """Parse 'dayM b'; yields the sub-result of dayM."""
    parsed = seq([dayM, b])
    if parsed != False:
        return parsed[0]
    return parsed
################
def empty_date_head():
    """Parse nothing; yields date_none as the missing head."""
    parsed = seq([])
    if parsed != False:
        return date_none
    return parsed
################################################################
#: date_tail = Year
#: | Cent
#: | vor_nach_um_phrase
#: | zwischen_phrase
#: | um_range
################
# i.e.:
# year
# yearpair
# century
# vor ...
# nach ...
# um ...
# zwischen ... und ...
# ("year" and "century" may be linked)
# returns one of:
# Date (with month == day == None)
# Range
################################
def date_tail():
    """Alternatives handed to oneof, in this order:
    Cent, Year, um_range, vor_nach_um_phrase, zwischen_phrase.
    """
    alternatives = [
        Cent,
        Year,
        um_range,
        vor_nach_um_phrase,
        zwischen_phrase,
    ]
    return oneof(alternatives)
################################################################
#: vor_nach_um_phrase = vor_phrase
#: | nach_phrase
#: | um_phrase
#
#: vor_phrase = vor b fix_date_or_um
#: nach_phrase = nach b fix_date_or_um
#: um_phrase = um b fix_date
#
#: zwischen_phrase = zwischen b fix_date_or_um b und b fix_date_or_um
################################
def vor_nach_um_phrase():
    """Alternatives handed to oneof: vor_phrase, nach_phrase, um_phrase."""
    alternatives = [vor_phrase, nach_phrase, um_phrase]
    return oneof(alternatives)
################################
def vor_phrase():
    """Parse 'vor b fix_date_or_um'; yields vor_interpretation of the date part."""
    parsed = seq([vor, b, fix_date_or_um])
    if parsed != False:
        return vor_interpretation(parsed[2])
    return parsed
################################
def nach_phrase():
    """Parse 'nach b fix_date_or_um'; yields nach_interpretation of the date part."""
    parsed = seq([nach, b, fix_date_or_um])
    if parsed != False:
        return nach_interpretation(parsed[2])
    return parsed
################################
def um_phrase():
    """Parse 'um b fix_date'; yields um_interpretation of the date part."""
    parsed = seq([um, b, fix_date])
    if parsed != False:
        return um_interpretation(parsed[2])
    return parsed
################################
def zwischen_phrase():
    """Parse 'zwischen b X b und b X' with X = fix_date_or_um.

    Yields the Range spanned by the two parsed bounds.
    """
    parsed = seq([zwischen, b, fix_date_or_um, b, und, b, fix_date_or_um])
    if parsed != False:
        lower, upper = parsed[2], parsed[6]
        return Range((lower, upper))
    return parsed
################################################################
#: um_range = um b range_yc
#
#: range_yc = year_range
#-#: | cent_range
#
#: year_range = number to number [ b bce ]
#-#: cent_range = number dot to number dot b jahrhundert [ b bce ]
################################
def um_range():
    """Parse 'um b range_yc'; yields um_interpretation of the parsed range."""
    parsed = seq([um, b, range_yc])
    if parsed != False:
        return um_interpretation(parsed[2])
    return parsed
################################
def range_yc():
    """Alternatives handed to oneof: only year_range is left."""
    alternatives = [
        year_range,
        #-# cent_range,
    ]
    return oneof(alternatives)
################################
def year_range ():
    """Parse 'number to number [ b bce ]' into a Range of two year dates.

    Both numbers are converted by intern_fyear together with the one
    bce_option result.
    """
    parsed = seq([number, to, number, bce_option])
    if parsed != False:
        first, second, era = parsed[0], parsed[2], parsed[3]
        from_year = intern_fyear(first[0], era)
        to_year = intern_fyear(second[0], era)
        return Range((Date(fy=from_year), Date(fy=to_year)))
    return parsed
#-#################################
#-#def cent_range ():
#-# rr = s = seq( [ number, dot, to, number, dot, b, jahrhundert, bce_option ] )
#-# #
#-# if (s != False):
#-# u = s[0]; v = s[3]; w = s[7]
#-# m = intern_cent( u[0], w )
#-# n = intern_cent( v[0], w )
#-# rr = Range( ( Date( c=m ), Date( c=n ) ) )
#-# #
#-# return rr
################################################################
def um_clause_right():
    """Alternatives handed to oneof: um_range, then um_phrase."""
    alternatives = [
        um_range,       # 1
        um_phrase,      # 2
    ]
    return oneof(alternatives)
################################
def fix_date_or_um():
    """Alternatives handed to oneof: um_clause_right, then fix_date."""
    alternatives = [
        um_clause_right,    # 1
        fix_date,           # 2
    ]
    return oneof(alternatives)
################################################################
#: fix_date = dayMY
#: | Year
#: | Cent
#
#: dayMY = dayM b Year
################################
def fix_date():
    """Alternatives handed to oneof, in this order: dayMY, Cent, Year."""
    alternatives = [dayMY, Cent, Year]
    return oneof(alternatives)
################################
def dayMY():
    """Parse 'dayM b Year' into one Date.

    The day/month part is copied and completed by the full year of the
    year part.
    NOTE(review): Year may also yield a Range (from yearpair), and Range
    defines no fyear() -- confirm how that input is meant to be handled.
    """
    parsed = seq([dayM, b, Year])
    if parsed != False:
        day_month, year_part = parsed[0], parsed[2]
        return Date(day_month, fy=year_part.fyear())
    return parsed
################################################################
#: dayM = DM_l | M
#: Year = Y_l | yearpair
#: Cent = C
################################
def dayM():
    """Day and month (maybe linked), or a month only: DM_l, then M."""
    alternatives = [DM_l, M]
    return oneof(alternatives)
################################
def Year():
    """A pair of successive years, or a year (maybe linked): yearpair, then Y_l."""
    alternatives = [
        yearpair,   # 1
        Y_l,        # 2
    ]
    return oneof(alternatives)
################################
def Cent():
    """A century; simply the result of C() (centuries are not linked)."""
    return C()
#-#def Cent(): return C_l()
################################################################
#: yearpair = number stroke number [ b bce ]
################################
def yearpair ():
    """Parse 'number stroke number [ b bce ]' into a Range of two year dates.

    When the second year is not the first year plus one, a message is
    added to the global defectset; the Range is built in either case.
    """
    global defectset
    parsed = seq([number, stroke, number, bce_option])
    if parsed != False:
        first, second, era = parsed[0], parsed[2], parsed[3]
        earlier = intern_fyear(first[0], era)
        later = intern_fyear(second[0], era)
        #
        if earlier + 1 != later:
            defectset.add("years of pair should be succesive years")
        #
        return Range((Date(fy=earlier), Date(fy=later)))
    return parsed
################################################################
# "xxx_l" means: "xxx" or "[[xxx]]"
########
#: DM_l = brackets_left DM brackets_right
#: | DM
#
#: Y_l = brackets_left Y brackets_right
#: | Y
#-#:
#-#: C_l = brackets_left C brackets_right
#-#: | C
################################
#-# def C_l (): return may_be_linked( C )
def Y_l ():
    """A year, plain or linked ('[[...]]')."""
    return may_be_linked(Y)


def DM_l():
    """A day with month, plain or linked ('[[...]]')."""
    return may_be_linked(DM)
################################
def may_be_linked( f ):
    """Alternatives handed to oneof: f itself, then f inside link brackets."""
    def in_brackets():
        return linked(f)

    return oneof([f, in_brackets])
################################
def linked( f ):
    """Parse 'brackets_left f brackets_right'; yields the sub-result of f."""
    parsed = seq([brackets_left, f, brackets_right])
    if parsed != False:
        return parsed[1]    # ignore linking
    return parsed
################################################################
#: DM = D b M
################################
def DM():
    """Parse 'D b M'; yields the day Date completed by the parsed month."""
    parsed = seq([D, b, M])
    if parsed != False:
        day_part, month_part = parsed[0], parsed[2]
        return Date(day_part, m=month_part.m())
    return parsed
################################################################
#: D = number dot
################################
def D():
    """Parse 'number dot'; yields a Date holding only the day.

    The day value is the first entry of the number result, converted by
    intern_day.
    """
    parsed = seq([number, dot])
    if parsed != False:
        digits = parsed[0]
        return Date(d=intern_day(digits[0]))
    return parsed
################################################################
#: M = Mnr
################################
def M():
    """Parse a month name via Mnr; yields a Date holding only the month.

    The month value is the first entry of the Mnr result.
    """
    parsed = seq([Mnr])
    if parsed != False:
        found = parsed[0]
        return Date(m=found[0])
    return parsed
################################################################
#: Y = number [ b bce ]
################################
def Y():
    """Parse 'number [ b bce ]'; yields a Date for that full year.

    The number and the bce_option result are combined by intern_fyear.
    """
    parsed = seq([number, bce_option])
    if parsed != False:
        digits, era = parsed[0], parsed[1]
        return Date(fy=intern_fyear(digits[0], era))
    return parsed
################################################################
#: C = number dot b jahrhundert [ b bce ]
################################
def C():
    """Parse 'number dot b jahrhundert [ b bce ]'; yields a century Date.

    The number and the bce_option result are combined by intern_cent.
    """
    parsed = seq([number, dot, b, jahrhundert, bce_option])
    if parsed != False:
        digits, era = parsed[0], parsed[4]
        return Date(c=intern_cent(digits[0], era))
    return parsed
################################################################
#: bce_option = [ b bce ]
################################
# bce/ce
################
# Era markers as produced by bce_phrase (BCE) and empty_bce_phrase (CE).
BCE, CE = -1, 1
################################
def bce_option():
    """Alternatives handed to oneof: bce_phrase, then empty_bce_phrase."""
    alternatives = [
        bce_phrase,         # 1
        empty_bce_phrase,   # 2
    ]
    return oneof(alternatives)
################################
def bce_phrase ():
    """Parse 'b bce'; yields the marker BCE."""
    parsed = seq([b, bce])
    if parsed != False:
        return BCE
    return parsed
################################
def empty_bce_phrase():
    """Parse nothing; yields the marker CE."""
    parsed = seq([])
    if parsed != False:
        return CE
    return parsed
################################################################
#: Mnr = Januar
#: | Februar
#: | Maerz
#: | April
#: | Mai
#: | Juni
#: | Juli
#: | August
#: | September
#: | Oktober
#: | November
#: | Dezember
#
#: Januar = "Januar"
#: Februar = "Februar"
#: Maerz = "März"
#: April = "April"
#: Mai = "Mai"
#: Juni = "Juni"
#: Juli = "Juli"
#: August = "August"
#: September = "September"
#: Oktober = "Oktober"
#: November = "November"
#: Dezember = "Dezember"
################################
def Mnr():
    """Parse any of the twelve month names (rule Mnr).

    Tries month indices 0..11 in order via Month_nr and returns the first
    successful result, a 1-tuple holding the month index, or False.
    """
    # `m=m` binds the index per alternative (avoids late-binding closures).
    alternatives = [lambda m=m: Month_nr(m) for m in range(12)]
    return oneof(alternatives)
################################
def Month_nr( m ):
    """Parse the name of month index `m` as stored in month_info.

    Returns the 1-tuple ``(m,)`` on success (a tuple so that month 0 is
    not mistaken for failure), otherwise False.
    """
    wanted_name = month_info[m][name]
    matched = seq([lambda: nextstring(wanted_name)])
    if matched is False:
        return False
    return (m,)
################################################################
################################################################
def vor_interpretation( u ):
    """Interpret 'vor u': a Range from date_none up to the upper end of `u`."""
    upper = u.hi()
    bounds = (date_none, upper)
    return Range(bounds)
####
def nach_interpretation( u ):
    """Interpret 'nach u': a Range from the lower end of `u` to actual_date."""
    lower = u.lo()
    bounds = (lower, actual_date)
    return Range(bounds)
####
def um_interpretation( u ):
    """Interpret 'um u': widen `u` on both sides, capped at actual_date.

    The widening applied to each end is taken from um_widening(), indexed
    by that end's precision() minus one, i.e. it is determined by the
    'finest' information available for that end.
    """
    base = Range(u)
    lo_precision = u.lo().precision()
    hi_precision = u.hi().precision()
    margins = um_widening()
    widened = base.wide((margins[lo_precision - 1], margins[hi_precision - 1]))
    # Never let the widened interval reach beyond the current date.
    if widened.hi_extrem() > actual_date:
        widened = Range((widened.lo(), actual_date))
    return widened
# ------
def um_widening():
    """Return the row of widenings selected by the global `um_option`.

    Each row holds four widenings; um_interpretation() picks one by
    ``precision() - 1``. Per the column headers these presumably belong to
    century, year, month and day precision, in that order.
    Raises IndexError when `um_option` is not a valid row index (0..10).
    """
    all_centuries = centuries_handled
    table = (
        # um 12. Jahrh.           | um 1111                  | um Mai 1111              | um 1. Mai 1111
        ( century * 0,             year * 0,                  month * 0,                 day * 0 ),                  # 0
        ( day * 1,                 day * 1,                   day * 1,                   day * 1 ),                  # 1
        ( year * 3,                month * 1,                 day * 5,                   day * 2 ),                  # 2
        ( year * 10,               year * 1,                  day * 10,                  day * 3 ),                  # 3
        ( year * 20,               year * 2,                  day * 20,                  day * 5 ),                  # 4
        ( year * 30,               year * 3,                  month * 1,                 day * 10 ),                 # 5
        ( year * 50,               year * 5,                  month * 2,                 day * 20 ),                 # 6
        ( century * 1,             year * 10,                 month * 3,                 month * 1 ),                # 7
        ( century * 2,             year * 20,                 month * 6,                 month * 2 ),                # 8
        ( century * 10,            year * 50,                 year * 5,                  month * 6 ),                # 9
        ( century * all_centuries, century * all_centuries,   century * all_centuries,   century * all_centuries ),  # 10
    )
    return table[um_option]
################################################################
################################################################
# Representation of the parts of a date:
#
# ========================================================================== #
# | external | internal | -- where -- #
# ------------ | ------------------------- | -------- | -------------------- #
# day | nnn. | nnn-1 | (1 <= nnn <= 31) #
# month | Januar, ..., Dezember | nnn | (0 <= nnn <= 11) #
# | | | #
# year BCE | nnn v. Chr. | -nnn+1 | (1 <= nnn <= 10200) #
# year CE | nnn | nnn | (1 <= nnn <= 2200) #
# | | | #
# century BCE | nnn. Jahrhundert v. Chr. | -nnn+1 | (1 <= nnn <= 102) #
# century CE | nnn. Jahrhundert | nnn | (1 <= nnn <= 22) #
# | | | #
# year/century | -- none -- | nnn | (0 <= nnn <= 99) #
# ========================================================================== #
#
# 'year/century' means 'number of year in its century' here,
# where counting starts with 0.
# Examples:
# 0 ~ year 1901, 9 ~ 1910, 99 ~ 2000
# 0 ~ year 2000 v. Chr., 9 ~ 1991 v. Chr., 99 ~ 1901 v. Chr.
#
# (Mostly 'year' means 'year/century',
# 'fyear' means 'full year', i.e. incl. century.)
#
################################################################
# bounds of internal (partial) dates
################################
# century
########
# Internal century bounds (see extern_cent/intern_cent for the mapping).
cent_lo = -111 # first century = 112th century BCE
cent_hi = 22 # last century = 22nd century
# -- number of centuries in [cent_lo, cent_hi], both ends included
centuries_handled = ( cent_hi - cent_lo) + 1
################
# year (without century, i.e. position of the year inside its century)
########
year_lo = 0 # first year in its century
year_hi = 99 # last year in its century
# -- number of years per century
years_in_century = ( year_hi - year_lo) + 1
################
# month (zero-based index)
########
month_lo = 0 # January
month_hi = 11 # December
# -- number of months per year
months_in_year = (month_hi - month_lo) + 1
################
# day (zero-based index)
########
day_lo = 0 # day 1 of a month
day_hi_max = 30 # day 31 of a month
# -- largest possible number of days in any month
maximal_days_in_month = (day_hi_max - day_lo) + 1
################################################################
################################################################
# conversions extern-intern
################################
# century
########
def extern_cent( n ):
    """Map an internal century to its external (written) number.

    Internal values <= 0 denote BCE centuries and map to ``-n + 1``.
    """
    if n <= 0:
        return -n + 1
    return n


def intern_cent( n, bce_ce ):
    """Map an external century number plus era marker to the internal value."""
    if bce_ce == BCE:
        return -n + 1
    return n


# --
def extern_cent_string( n ):
    """Format an internal century as text, e.g. '<nr>. ' + cent_str.

    The BCE marker (bce_str) is appended for internal values <= 0.
    """
    text = str(extern_cent(n)) + ". " + cent_str
    if n > 0:
        return text
    return text + " " + bce_str
################
# year (con century)
########
def extern_fyear( n ):
    """Map an internal full year to its external (written) number.

    Internal values <= 0 denote BCE years and map to ``-n + 1``.
    """
    if n <= 0:
        return -n + 1
    return n


def intern_fyear( n, bce_ce ):
    """Map an external year number plus era marker to the internal value."""
    if bce_ce == BCE:
        return -n + 1
    return n


# --
def extern_fyear_string( n ):
    """Format an internal full year as text; appends bce_str for values <= 0."""
    text = str(extern_fyear(n))
    if n > 0:
        return text
    return text + " " + bce_str
################
# month (as number)
########
def extern_month( n ):
    """Convert a zero-based internal month index to the external 1..12 number."""
    external = n + 1
    return external


def intern_month( n ):
    """Convert an external month number (1..12) to the zero-based internal index."""
    internal = n - 1
    return internal
################
# day
########
def extern_day( n ):
    """Convert a zero-based internal day to the external day of month."""
    external = n + 1
    return external


def intern_day( n ):
    """Convert an external day of month to the zero-based internal day."""
    internal = n - 1
    return internal


# --
def extern_day_string( n ):
    """Return the external day of month for internal day `n` as a string."""
    external = extern_day(n)
    return str(external)
################################################################
# month information
################################
# keys
################
# Keys of the per-month dictionaries in month_info:
#   name          -> German month name, as matched by Month_nr()
#   no_leap_year  -> number of days in a non-leap year
#   leap_year     -> number of days in a leap year
name = -1
no_leap_year = False
leap_year = True
################################
# One entry per month, indexed by the internal month number (0 = January).
month_info = ( # The anomaly of October 1582 is not represented in this table.
{ name: "Januar" , no_leap_year: 31, leap_year: 31 },
{ name: "Februar" , no_leap_year: 28, leap_year: 29 },
{ name: "März" , no_leap_year: 31, leap_year: 31 },
{ name: "April" , no_leap_year: 30, leap_year: 30 },
{ name: "Mai" , no_leap_year: 31, leap_year: 31 },
{ name: "Juni" , no_leap_year: 30, leap_year: 30 },
{ name: "Juli" , no_leap_year: 31, leap_year: 31 },
{ name: "August" , no_leap_year: 31, leap_year: 31 },
{ name: "September", no_leap_year: 30, leap_year: 30 },
{ name: "Oktober" , no_leap_year: 31, leap_year: 31 },
{ name: "November" , no_leap_year: 30, leap_year: 30 },
{ name: "Dezember" , no_leap_year: 31, leap_year: 31 },
)
# ------
october = 9 # index of October in month_info
################################################################
# The gap in October 1582.
#
# On introducing the Gregorian calendar in 1582, 10 days were dropped,
# so for 1582 the day after October 4 is October 15.
################################
# The month in which the calendar gap lies (October 1582, no day set).
reform_month = Date( fy=1582, m=october )
# -- last day before and first day after the gap
pre_gap = Date( reform_month, d=intern_day( 4) ) # 4. 10. 1582
post_gap = Date( reform_month, d=intern_day(15) ) # 15. 10. 1582
# Number of days strictly between pre_gap and post_gap
# (presumably 10, assuming Date.d() returns the internal day -- TODO confirm).
length_of_gap = post_gap. d() - pre_gap. d() - 1
# --
def length_of_gap_in_month( u ):
    """Return length_of_gap if `u` (day ignored) is the reform month, else 0."""
    month_of_u = Date(u, d=None)
    if month_of_u != reform_month:
        return 0
    return length_of_gap
# --
def in_gap( u ):
    """Tell whether `u` is a day-exact date strictly between pre_gap and post_gap.

    When `u` is not exact to the day, the (falsy) result of
    u.is_date_to_the_day() is returned unchanged.
    """
    exact_to_day = u.is_date_to_the_day()
    if not exact_to_day:
        return exact_to_day
    return pre_gap < u < post_gap
################################################################
def days_in_month( u ):
    """Get the maximal number of days for a (maybe incomplete) given month.

    `u` must have a normalized month (asserted). Missing parts are treated
    generously: without a month the overall maximum is returned, without a
    year the leap-year length. Before the reform month every fourth year
    is a leap year; from the reform month on, the last year of a century
    is a leap year only if the century number is divisible by 4. For the
    reform month itself the dropped days (length_of_gap) are subtracted.
    """
    assert u.month_is_normalized()
    cent, year_in_cent, month_index, _day = u.elems()
    month_only = Date(u, d=None)

    if month_index is None:
        return maximal_days_in_month

    short_length = month_info[month_index][no_leap_year]
    long_length = month_info[month_index][leap_year]

    if year_in_cent is None:
        return long_length

    # year_in_cent counts from 0, so (year_in_cent + 1) is the year's
    # position inside its century (1..100).
    fourth_year = (year_in_cent + 1) % 4 == 0
    if cent is None:
        return long_length if fourth_year else short_length

    if month_only < reform_month:
        is_leap = fourth_year
    else:
        skipped_century_year = (year_in_cent + 1) == 100 and cent % 4 != 0
        is_leap = fourth_year and not skipped_century_year
    days = long_length if is_leap else short_length

    if month_only == reform_month:
        days -= length_of_gap
    return days
################
def day_hi( u ):
    """Return the highest internal (zero-based) day number of `u`'s month.

    The gap length is added back for the reform month, because the day
    numbers there still run up to the unshortened end of the month.
    """
    day_count = days_in_month(u)
    gap_days = length_of_gap_in_month(u)
    return day_count + gap_days - 1
################################################################
# bounds of internal (full) dates
# (Note: In course of computation, dates occur outside of these bounds.)
################################
# bounds for computed values
########
# Extreme dates of an entirely unspecified Date().
# NOTE(review): the dates quoted in the two trailing comments do not obviously
# agree with cent_lo = -111 / cent_hi = 22 -- confirm against
# Date.lo_extrem()/hi_extrem().
ultima_lo = Date(). lo_extrem() # 1 January 10200 BCE
ultima_hi = Date(). hi_extrem() # 31 December 2300
# ------
# The complete range of dates handled in computations.
ultima = Range( ( ultima_lo, ultima_hi ) )
################
# bounds for input values
########
def get_actual_date():
    """Build the internal Date used as 'today' from the local clock.

    One day is added to the local day (the original remark says
    'time zones'); normalize_month() is then applied, presumably to roll
    a day beyond the end of the month over into the next one.
    """
    ext_year, ext_month, ext_day = time.localtime()[0:3]  # e.g. (2010, 1, 1)
    year_part = intern_fyear(ext_year, CE)
    month_part = intern_month(ext_month)
    day_part = intern_day(ext_day + 1)  # time zones
    today = Date(fy=year_part, m=month_part, d=day_part)
    return today.normalize_month()
# ------
actual_date = get_actual_date()
# ------
def ultima_input_dates():
    """Return the Range of dates accepted in the input.

    The lower bound is the lower end of `ultima` moved two centuries
    later; the upper bound is actual_date.
    """
    earliest = ultima.lo()
    lower_bound = Date(earliest, c=earliest.c() + 2)
    upper_bound = actual_date
    return Range((lower_bound, upper_bound))
# ------
ultima_input = ultima_input_dates()
################################################################
# bounds of (partial) dates in the input (converted already)
################################
# year (con century)
########
# Full-year bounds, taken from the two ends of ultima_input.
fyear_input_lo = ultima_input. lo(). fyear()
fyear_input_hi = ultima_input. hi(). fyear()
################
# century
########
# Century bounds, taken from the two ends of ultima_input.
cent_input_lo = ultima_input. lo(). c()
cent_input_hi = ultima_input. hi(). c()
################################################################
###############################################################
#: b = " "
#: colon = ":"
#: dot = "."
#: stroke = "/"
#: to = "–"
#-#: to = "/" | "–" # alternative: to-Strich ("–")
#: brackets_left = "[["
#: brackets_right = "]]"
#: unsicher = "unsicher"
#: getauft = "getauft"
#: begraben = "begraben"
#: oder = "oder"
#: vor = "vor"
#: nach = "nach"
#: zwischen = "zwischen"
#: um = "um"
#: und = "und"
#: jahrhundert = "Jahrhundert"
#: bce = "v. Chr."
################################
# Literal texts shared by the token parsers (jahrhundert, bce) and the
# formatters (extern_cent_string, extern_fyear_string).
cent_str = "Jahrhundert"
bce_str = "v. Chr."
################################
# Terminal symbols of the grammar. Each parser consumes exactly one fixed
# text via nextstring() and returns that text, or False if it is not next.
def b():
    """Match a single blank."""
    return nextstring( " " )

def colon():
    """Match ':'."""
    return nextstring( ":" )

def dot():
    """Match '.'."""
    return nextstring( "." )

def stroke():
    """Match '/'."""
    return nextstring( "/" )

def to():
    """Match the range dash '–'."""
    return nextstring( "–" )

#-#def to (): return nextstring( "/" ) or nextstring( "–" )

def brackets_left():
    """Match '[['."""
    return nextstring( "[[" )

def brackets_right():
    """Match ']]'."""
    return nextstring( "]]" )

def unsicher():
    """Match the keyword 'unsicher'."""
    return nextstring( "unsicher" )

def getauft():
    """Match the keyword 'getauft'."""
    return nextstring( "getauft" )

def begraben():
    """Match the keyword 'begraben'."""
    return nextstring( "begraben" )

def oder():
    """Match the keyword 'oder'."""
    return nextstring( "oder" )

def vor():
    """Match the keyword 'vor'."""
    return nextstring( "vor" )

def nach():
    """Match the keyword 'nach'."""
    return nextstring( "nach" )

def zwischen():
    """Match the keyword 'zwischen'."""
    return nextstring( "zwischen" )

def und():
    """Match the keyword 'und'."""
    return nextstring( "und" )

def um():
    """Match the keyword 'um'."""
    return nextstring( "um" )

def jahrhundert():
    """Match the century word (cent_str)."""
    return nextstring( cent_str )

def bce():
    """Match the BCE marker (bce_str)."""
    return nextstring( bce_str )
################################################################
# elementary functions:
# nextstring: 1 parameter - string of characters, which should come next
# number : 0 parameters - a number without a leading zero should come next
################################
def nextstring( s ):
    """Consume the literal `s` at the current position of the global `string`.

    On a match the global `index_in_string` advances past `s` and `s` is
    returned; otherwise the position is left alone and False is returned.
    `s` must not be empty, because "" would read as failure to the caller.
    """
    global string, index_in_string
    start = index_in_string
    assert ( s != "")  # returning "" would be interpreted as False...
    stop = start + len( s )
    if string[ start : stop ] != s:
        return False
    index_in_string = stop
    return s
# -------
def number():
    """Consume a run of digits at the current position of the global `string`.

    Returns the value wrapped in a 1-tuple (so that 0 is not mistaken for
    failure) and advances `index_in_string`; returns False, without moving,
    when no digit follows. A '0' directly followed by another digit is
    still accepted but recorded in `defectset` as "leading zero".
    """
    global string, index_in_string, defectset
    start = index_in_string
    text = string
    # Slices instead of indexing: out-of-range positions give "" here.
    if text[ start : start + 1 ] == "0" and text[ start + 1 : start + 2 ]. isdigit():
        defectset. add( "leading zero" )
    stop = start
    while stop < len( text ) and text[ stop ]. isdigit():
        stop += 1
    index_in_string = stop
    if stop == start:
        return False
    return ( int( "0" + text[ start : stop ] ), )
# -------
def nothing_anymore():
    """Return True when the whole global `string` has been consumed."""
    remaining = len( string ) - index_in_string
    return remaining == 0
################################################################
# controlling functions:
# seq
# oneof
# repeat
################################
def seq( sequence ):
    """Apply the parsers in `sequence` one after another.

    Returns the list of their results when all succeed. As soon as one
    returns a value equal to False, the global `index_in_string` is reset
    to where it was on entry and False is returned.
    """
    global index_in_string
    entry_position = index_in_string
    collected = []
    for parser in sequence:
        outcome = parser()
        if outcome == False:
            # Backtrack: undo whatever the earlier parsers consumed.
            index_in_string = entry_position
            return False
        collected. append( outcome )
    return collected
# ------
def oneof( options ):
    """Try the parsers in `options` in order; return the first success.

    A result equal to False counts as failure. Returns False when every
    alternative fails (or `options` is empty).
    """
    for alternative in options:
        outcome = alternative()
        if outcome == False:
            continue
        return outcome
    return False
# -------
def repeat( f ):
    """Call parser `f` until it returns a value equal to False.

    Returns the list of all results obtained before that, possibly empty.
    """
    collected = []
    outcome = f()
    while not ( outcome == False ):
        collected. append( outcome )
        outcome = f()
    return collected
################################################################
# Shared parser state, reset by handle_iline() before each date field:
string = ""           # the text currently being parsed
index_in_string = 0   # position in `string` up to which input is consumed
defectset = set()     # defect messages collected for the current text
################################
def _defect_lines( iline, keyword, parse ):
    """Check every `keyword` field in `iline`; return the defect report lines.

    For each occurrence of `keyword` (up to the next '|') the field value
    is parsed with `parse`. If any defect is found, one output line is
    produced: the input line without its line terminator, followed by
    '|| <defect>' for each defect and a newline.
    """
    # uses package 're'
    global string, index_in_string, defectset
    # Strip only a real line terminator; the last line of a file may
    # lack one, and then no character must be lost.
    stem = iline[:-1] if iline. endswith( "\n" ) else iline
    delim = "|| "
    olines = []
    for dat in re. findall( keyword + '[^|]*', iline ):
        dat = dat[ len( keyword ): ]. lstrip(). lstrip( "=" ). strip()
        # Reset the shared parser state for this field.
        string, index_in_string = dat, 0
        defectset = set()
        r = parse()
        if ( r == False ):
            defectset. add( "syntax error" )
        else:
            for l in list_elems_depth_first( r[ dates] ):
                defectset. update( l. defects() )
        mm = "". join( [ delim + m for m in defectset ] )
        if mm != "":
            olines. append( stem + mm + "\n" )
    return olines
################
def handle_iline( iline ): # WORK
    """Handle one input line; return the list of report lines for it.

    All GEBURTSDATUM fields are checked first (with date_of_birth), then
    all STERBEDATUM fields (with date_of_death). Lines without defects
    yield no output.
    """
    # The lambdas defer the lookup of the parsers until a field is found.
    return ( _defect_lines( iline, "GEBURTSDATUM", lambda: date_of_birth() )
           + _defect_lines( iline, "STERBEDATUM", lambda: date_of_death() ) )
################################
def handle_oline( fo, oline ):
    """Write one report line `oline` to the output file object `fo`."""
    emit = fo. write
    emit( oline )
################################################################
################################################################
def main( file_names ):
    """Organize the total work.

    Opens the input and output file named in `file_names` ("-" selects a
    standard stream), passes every input line to handle_iline() and writes
    the resulting report lines with handle_oline().
    """
    files = open_files( file_names )
    try:
        fi, fo = files
        for iline in fi:
            for oline in handle_iline( iline ):
                handle_oline( fo, oline )
    finally:
        # Close the files even when processing a line raises.
        close_files( files )
################################
def open_files( names ):
    """Open each name in `names` with the matching mode from file_modes.

    Names and modes are paired with zip(), so surplus entries on either
    side are ignored. Returns the list of opened file objects.
    """
    return [ open_file( name, mode ) for name, mode in zip( names, file_modes ) ]
####
def close_files( files ):
    """Close every file in `files` via close_file()."""
    for handle in files:
        close_file( handle )
########
def open_file( name, mode ):
    """Open a file, if necessary.

    The name "-" selects the standard stream for `mode` (see
    standardfile()); any other name is opened with the built-in open(),
    which exists in Python 2 and 3 alike, unlike the Python-2-only file().
    """
    if name == "-":
        return standardfile( mode )
    return open( name, mode )
####
def close_file( f ):
    """Close `f`, unless it is one of the standard streams."""
    if not is_standard_file( f ):
        f. close()
########
def standardfile( mode ):
    """Choose a standard stream, depending on `mode`.

    'r' gives sys.stdin, 'w' gives sys.stdout; any other mode gives None.
    """
    streams = ( ( 'r', sys. stdin ), ( 'w', sys. stdout ) )
    for key, stream in streams:
        if mode == key:
            return stream
    return None
####
def is_standard_file( f ):
    """Tell whether `f` is sys.stdin, sys.stdout or sys.stderr."""
    standard_streams = ( sys. stdin, sys. stdout, sys. stderr )
    return f in standard_streams
################################################################
# Number of file-name arguments start_program() passes on to main();
# missing ones are filled in with "-" (standard file).
number_of_wanted_arguments = 2 # names of inputfile, outputfile
# or 1 inputfile; outputfile: standard file
# or 0 inputfile and outputfile: standard file
# Open modes, paired by position with the file names (input, output).
file_modes = ['r', 'w']
########
def start_program():
    """Interpret the command-line arguments and start the work.

    Leading arguments are taken as options as long as the first one
    starts with '-' (but is not the bare "-"), or as long as more
    arguments remain than file names are wanted. "--" ends option
    processing. Missing file names are replaced by "-" (standard file);
    the last `number_of_wanted_arguments` arguments go to main().
    """
    args = sys. argv[1:]
    wanted = number_of_wanted_arguments
    while True:
        looks_like_option = ( len( args ) > 0
                              and args[0]. startswith( '-' )
                              and args[0] != "-" )
        if not ( looks_like_option or len( args ) > wanted ):
            break
        option = args. pop( 0 )
        if option == "--":
            break
        evaluate( option, args )
    # omitted file --> standard file
    args. extend( [ "-" ] * ( wanted - len( args ) ) )
    main( args[ -wanted: ] )
########
um_option = 7 # default value (for now)
um_option_max = 10 # highest row index of the table in um_widening()
# --
def evaluate( option, arguments ):
    """Evaluate one command-line option.

    Only '-0' ... '-10' are recognized; they set the global `um_option`,
    which selects the row used by um_widening(). Everything else,
    including numbers above `um_option_max` (they would index past the
    widening table), is reported and ignored. `arguments` (the remaining
    arguments) is currently unused.
    """
    global um_option
    digits = option[1:]
    if ( 2 <= len( option ) <= 3
         and option[0] == "-"
         and digits. isdigit()
         and int( digits ) <= um_option_max
       ):
        um_option = int( digits )
    else:
        # Single parenthesized argument: valid in Python 2 and 3 alike.
        print( 'Unidentified option \"' + option + '\" will be ignored.' )
################
# Run only when executed as a script, so that importing this module
# (e.g. for testing single functions) does not start processing.
if __name__ == "__main__":
    start_program()
################################################################
# '''
'''
?>