#Copyright ReportLab Europe Ltd. 2000-2017 #see license.txt for license details #history https://bitbucket.org/rptlab/reportlab/history-node/tip/src/reportlab/tools/docco/t_parse.py """ Template parsing module inspired by REXX (with thanks to Donn Cave for discussion). Template initialization has the form: T = Template(template_string, wild_card_marker, single_char_marker, x = regex_x, y = regex_y, ...) Parsing has the form ([match1, match2, ..., matchn], lastindex) = T.PARSE(string) Only the first argument is mandatory. The resultant object efficiently parses strings that match the template_string, giving a list of substrings that correspond to each "directive" of the template. Template directives: Wildcard: The template may be initialized with a wildcard that matches any string up to the string matching the next directive (which may not be a wild card or single character marker) or the next literal sequence of characters of the template. The character that represents a wildcard is specified by the wild_card_marker parameter, which has no default. For example, using X as the wildcard: >>> T = Template("prefixXinteriorX", "X") >>> T.PARSE("prefix this is before interior and this is after") ([' this is before ', ' and this is after'], 47) >>> T = Template("<X>X<X>", "X") >>> T.PARSE('<A HREF="index.html">go to index</A>') (['A HREF="index.html"', 'go to index', '/A'], 36) Obviously the character used to represent the wildcard must be distinct from the characters used to represent literals or other directives. Fixed length character sequences: The template may have a marker character which indicates a fixed length field. All adjacent instances of this marker will be matched by a substring of the same length in the parsed string. For example: >>> T = Template("NNN-NN-NNNN", single_char_marker="N") >>> T.PARSE("1-2-34-5-12") (['1-2', '34', '5-12'], 11) >>> T.PARSE("111-22-3333") (['111', '22', '3333'], 11) >>> T.PARSE("1111-22-3333") ValueError: literal not found at (3, '-') A template may have multiple fixed length markers, which allows fixed length fields to be adjacent, but recognized separately. For example: >>> T = Template("MMDDYYX", "X", "MDY") >>> T.PARSE("112489 Somebody's birthday!") (['11', '24', '89', " Somebody's birthday!"], 27) Regular expression markers: The template may have markers associated with regular expressions. the regular expressions may be either string represenations of compiled. For example: >>> T = Template("v: s i", v=id, s=str, i=int) >>> T.PARSE("this_is_an_identifier: 'a string' 12344") (['this_is_an_identifier', "'a string'", '12344'], 39) >>> Here id, str, and int are regular expression conveniences provided by this module. Directive markers may be mixed and matched, except that wildcards cannot precede wildcards or single character markers. Example: >>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str) >>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'") (['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72) >>> """ import re, string from reportlab.lib.utils import ascii_letters # # template parsing # # EG: T = Template("(NNN)NNN-NNNN X X", "X", "N") # ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters") # class Template: def __init__(self, template, wild_card_marker=None, single_char_marker=None, **marker_to_regex_dict): self.template = template self.wild_card = wild_card_marker self.char = single_char_marker # determine the set of markers for this template markers = list(marker_to_regex_dict.keys()) if wild_card_marker: markers.append(wild_card_marker) if single_char_marker: for ch in single_char_marker: # allow multiple scm's markers.append(ch) self.char = single_char_primary = single_char_marker[0] self.markers = markers for mark in markers: if len(mark)>1: raise ValueError("Marks must be single characters: "+repr(mark)) # compile the regular expressions if needed self.marker_dict = marker_dict = {} for mark, rgex in marker_to_regex_dict.items(): if isinstance(rgex,str): rgex = re.compile(rgex) marker_dict[mark] = rgex # determine the parse sequence parse_seq = [] # dummy last char lastchar = None index = 0 last = len(template) # count the number of directives encountered ndirectives = 0 while index<last: start = index thischar = template[index] # is it a wildcard? if thischar == wild_card_marker: if lastchar == wild_card_marker: raise ValueError("two wild cards in sequence is not allowed") parse_seq.append( (wild_card_marker, None) ) index = index+1 ndirectives = ndirectives+1 # is it a sequence of single character markers? elif single_char_marker and thischar in single_char_marker: if lastchar == wild_card_marker: raise ValueError("wild card cannot precede single char marker") while index<last and template[index] == thischar: index = index+1 parse_seq.append( (single_char_primary, index-start) ) ndirectives = ndirectives+1 # is it a literal sequence? elif not thischar in markers: while index<last and not template[index] in markers: index = index+1 parse_seq.append( (None, template[start:index]) ) # otherwise it must be a re marker else: rgex = marker_dict[thischar] parse_seq.append( (thischar, rgex) ) ndirectives = ndirectives+1 index = index+1 lastchar = template[index-1] self.parse_seq = parse_seq self.ndirectives = ndirectives def PARSE(self, s, start=0): ndirectives = self.ndirectives wild_card = self.wild_card single_char = self.char parse_seq = self.parse_seq lparse_seq = len(parse_seq) - 1 # make a list long enough for substitutions for directives result = [None] * ndirectives current_directive_index = 0 currentindex = start # scan through the parse sequence, recognizing for parse_index in range(lparse_seq + 1): (indicator, data) = parse_seq[parse_index] # is it a literal indicator? if indicator is None: if s.find(data, currentindex) != currentindex: raise ValueError("literal not found at "+repr((currentindex,data))) currentindex = currentindex + len(data) else: # anything else is a directive # is it a wildcard? if indicator == wild_card: # if it is the last directive then it matches the rest of the string if parse_index == lparse_seq: last = len(s) # otherwise must look at next directive to find end of wildcard else: # next directive must be re or literal (nextindicator, nextdata) = parse_seq[parse_index+1] if nextindicator is None: # search for literal last = s.find(nextdata, currentindex) if last<currentindex: raise ValueError("couldn't terminate wild with lit "+repr(currentindex)) else: # data is a re, search for it last = nextdata.search(s, currentindex) if last<currentindex: raise ValueError("couldn't terminate wild with re "+repr(currentindex)) elif indicator == single_char: # data is length to eat last = currentindex + data else: # other directives are always regular expressions last = data.match(s, currentindex) + currentindex if last<currentindex: raise ValueError("couldn't match re at "+repr(currentindex)) #print("accepting", s[currentindex:last]) result[current_directive_index] = s[currentindex:last] current_directive_index = current_directive_index+1 currentindex = last # sanity check if current_directive_index != ndirectives: raise SystemError("not enough directives found?") return (result, currentindex) # some useful regular expressions USERNAMEREGEX = \ "["+ascii_letters+"]["+ascii_letters+string.digits+"_]*" STRINGLITREGEX = "'[^\n']*'" SIMPLEINTREGEX = "["+string.digits+"]+" id = re.compile(USERNAMEREGEX) str = re.compile(STRINGLITREGEX) int = re.compile(SIMPLEINTREGEX) def test(): global T, T1, T2, T3 T = Template("(NNN)NNN-NNNN X X", "X", "N") print(T.PARSE("(908)949-2726 Aaron Watters")) T1 = Template("s --> s blah", s=str) s = "' <-- a string --> ' --> 'blah blah another string blah' blah" print(T1.PARSE(s)) T2 = Template("s --> NNNiX", "X", "N", s=str, i=int) print(T2.PARSE("'A STRING' --> 15964653alpha beta gamma")) T3 = Template("XsXi", "X", "N", s=str, i=int) print(T3.PARSE("prefix'string'interior1234junk not parsed")) T4 = Template("MMDDYYX", "X", "MDY") print(T4.PARSE("122961 Somebody's birthday!")) if __name__=="__main__": test()