(* $Id: sample.ml 744 2009-02-02 21:10:02Z gerd $ * ---------------------------------------------------------------------- * *) (**********************************************************************) (* Examples for event-based parsing ("SAX-like parsing") *) (**********************************************************************) open Pxp_ev_parser open Pxp_lexer_types open Pxp_types open Expr open Exprlex open Printf (* dump_event: dumps a single parsing event *) let dump_event e = print_endline (Pxp_event.string_of_event e) (* parse: prints the events while parsing the passed string *) let parse s = process_entity default_config (`Entry_document[]) (create_entity_manager default_config (from_string s)) dump_event; flush stdout (* curly_parse: demonstrates how to use escape_contents. The character * { escapes from normal parsing and calls [escape]. The arithmetic * expression inside { ... } is evaluated, and the result is taken * as the text content. * Try: curly_parse "<a>{123 + 5}</a>" * curly_parse "<a>{{123 + 5}}</a>" * curly_parse "<a>{123 + 5</a>}</a>" *) let inc_col (l,c) = (l,c+1);; let inc_line (l,c) = (l+1,0);; let add_col n (l,c) = (l,c+n);; let curly_parse s = let parse_expr mng = (* We get now the current lexical buffer of PXP, and use it for * our own parsing. In particular, we call Expr.topexpr * to parse the arithmetic expression. While parsing, * we track the current line and column (function [scan]). *) let line_col = ref (mng # current_line_column) in (* Note: current_line_column returns the position of the beginning of * the token that has been parsed last. The last token was "{" (Lcurly). * So we must add 1 to this position to get the position of the * beginning of the next token. *) line_col := inc_col !line_col; let rec scan buf = match scan_expr buf with Newline -> line_col := inc_line !line_col; scan buf | Space -> let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in line_col := add_col n !line_col; scan buf | tok -> let n = Lexing.lexeme_end buf - Lexing.lexeme_start buf in line_col := add_col n !line_col; tok in let lexbuf = match mng # current_lexer_obj # lexbuf with | `Ocamllex lexbuf -> lexbuf | `Netulex _ -> failwith "Netulex lexbuf not supported" in let value = topexpr scan lexbuf in printf "Result of expression: %d\n" value; mng # update_line_column !line_col; string_of_int value in let escape_contents tok mng = (* This function is called when "{", "{{", "}", or "}}" are found in * character node context. *) match tok with Lcurly -> (* "{" found: *) parse_expr mng | LLcurly -> (* "{{" found: map to "{" *) "{" | Rcurly -> (* "}" found *) failwith "Single brace } not allowed" | RRcurly -> (* "}}" found: map to "}" *) "}" | _ -> assert false in let escape_attributes tok pos mng = (* This function is called when "{", "{{", "}", or "}}" are found in * attribute values. *) match tok with Lcurly -> (* "{" found: *) parse_expr mng | LLcurly -> (* "{{" found: *) "{" | Rcurly -> (* "}" found: *) failwith "Single brace } not allowed" | RRcurly -> (* "}}" found: *) "}" | _ -> assert false in let config = { default_config with escape_contents = Some escape_contents; escape_attributes = Some escape_attributes; } in process_entity config (`Entry_document[]) (create_entity_manager config (from_string s)) dump_event; flush stdout ;; (* rec_curly_parse: Here, escape_contents calls the XML parser recursively, * i.e. you can write XML documents inside curly braces, like in * rec_curly_parse "<A> { <B> x </B> } </A>" or * rec_curly_parse "<A att='{ <B> x </B> }'>y</A>" * * This is currently very experimental! *) let rec_curly_parse s = let ent_id_guard = Pxp_dtd.Entity.create_entity_id() in let base_config = default_config in let rec escape ent_id tok mng = (* ent_id: is the entity ID containing the last Lcurly, or ent_id_guard * when there was none yet *) let current_ent = mng # current_entity in let current_ent_id = (current_ent :> entity_id) in match tok with Lcurly -> printf "Starting subparser...\n"; (* Because [current_ent] is already open, we cannot use it as * top-level entity in [process_entity] (it is not possible to * open an entity several times). The solution is [sub_ent], * a so-called entity section that behaves much like [current_ent] * and shares most of the state with [current_ent], but pretends * it were an entity of its own. *) let sub_ent = new Pxp_entity.entity_section current_ent in let sub_ent_id = (sub_ent :> entity_id) in let sub_config = { base_config with escape_contents = Some (escape sub_ent_id) ; escape_attributes = Some (escape_att sub_ent_id) ; } in (* Pushing sub_ent makes it the top-level entity: *) mng # push_entity sub_ent; process_entity sub_config (`Entry_document[]) mng dump_event; assert(mng # current_entity = sub_ent); (* Pop sub_ent afterwards: *) mng # pop_entity (); "" | LLcurly -> "{" | Rcurly -> if ent_id = ent_id_guard then (* A right curly brace without matching left curly brace *) failwith "Single brace } not allowed" else if ent_id = current_ent_id then ( (* The normal case: *) printf "Stopping subparser...\n"; ignore(current_ent # close_entity); "" (* Causes that the current [process_entity] parser invocation * terminates (if possible) *) ) else (* ent_id <> current_ent_id: This can happen if entities and * braces are used in strange ways: * <!DOCTYPE a [ <!ENTITY brace '}'> ]> * <a> { <b>xxx</b> &brace; </a> *) failwith "Bad nesting of entities and braces {...}" | RRcurly -> "}" | _ -> assert false and escape_att ent_id tok pos mng = escape ent_id tok mng in let config = { base_config with escape_contents = Some (escape ent_id_guard); escape_attributes = Some (escape_att ent_id_guard); } in process_entity config (`Entry_document[]) (create_entity_manager config (from_string s)) dump_event; flush stdout ;; (* parse_expr: An example for process_expr that parses the expressions * found in a string one after another. * Example: * parse_expr "<?pi?><abc>def</abc> <qrt>def</qrt> " * * Unfortunately, we need the undocumented methods [open_entity] and * [close_entity] from [Pxp_entity], and we need the knowledge that * a [Begin_entity] token can be found at the beginning of the entity, * and that an [End_entity] token signals the end of the entity. Using * [process_expr] is quite low-level. *) let parse_expr s = let config = { default_config with enable_pinstr_nodes = true; enable_comment_nodes = true } in let m = create_entity_manager config (from_string s) in m # current_entity # open_entity true Pxp_lexer_types.Content; let begin_entity_token = !(m # yy_get_next_ref)() in assert (begin_entity_token = Begin_entity); let tok = ref Ignore in (* Ignore does not occur in the token stream *) while !tok <> End_entity do let first_token = if !tok <> Ignore then Some !tok else None in printf "*** Calling process_expr...\n"; process_expr ?first_token ~following_token:tok config m dump_event; done; ignore(m # current_entity # close_entity); ;;