(* $Id: main.ml 745 2009-02-02 21:23:48Z gerd $ * ---------------------------------------------------------------------- * *) (* Convert XHTML-1.0 document into HTML-4.01 document. The input is parsed * in well-formedness mode, and is not validated. The input should have one * of these DOCTYPEs: * - PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" * - PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" * - PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" * * The input must declare the HTML namespace as http://www.w3.org/1999/xhtml. * * The XHTML entity sets (Latin 1, Special, Symbol) can be assumed. * In output, always numeric entities are printed. * * Processing instructions and comments are removed from the document. * * Output encoding is 7 bit ASCII. * * TODO: * - id/name: These attributes are not yet checked, but simply printed as * found in the input * - XML stylesheets are not yet transformed to HTML stylesheets * (conversion of <?xml-stylesheet ...?> to <style ...>) *) open Pxp_tree_parser;; open Pxp_types;; open Pxp_document;; type html_doctype = [ `Frameset | `Transitional | `Strict ] ;; (**********************************************************************) (* XHTML entities *) (**********************************************************************) let xhtml_lat1_ent = "<?xml version='1.0' encoding='UTF-8'?> <!ENTITY nbsp ' '> <!ENTITY iexcl '¡'> <!ENTITY cent '¢'> <!ENTITY pound '£'> <!ENTITY curren '¤'> <!ENTITY yen '¥'> <!ENTITY brvbar '¦'> <!ENTITY sect '§'> <!ENTITY uml '¨'> <!ENTITY copy '©'> <!ENTITY ordf 'ª'> <!ENTITY laquo '«'> <!ENTITY not '¬'> <!ENTITY shy '­'> <!ENTITY reg '®'> <!ENTITY macr '¯'> <!ENTITY deg '°'> <!ENTITY plusmn '±'> <!ENTITY sup2 '²'> <!ENTITY sup3 '³'> <!ENTITY acute '´'> <!ENTITY micro 'µ'> <!ENTITY para '¶'> <!ENTITY middot '·'> <!ENTITY cedil '¸'> <!ENTITY sup1 '¹'> <!ENTITY ordm 'º'> <!ENTITY raquo '»'> <!ENTITY frac14 '¼'> <!ENTITY frac12 '½'> <!ENTITY frac34 '¾'> <!ENTITY iquest '¿'> <!ENTITY Agrave 'À'> <!ENTITY Aacute 'Á'> <!ENTITY Acirc 'Â'> <!ENTITY Atilde 'Ã'> <!ENTITY Auml 'Ä'> <!ENTITY Aring 'Å'> <!ENTITY AElig 'Æ'> <!ENTITY Ccedil 'Ç'> <!ENTITY Egrave 'È'> <!ENTITY Eacute 'É'> <!ENTITY Ecirc 'Ê'> <!ENTITY Euml 'Ë'> <!ENTITY Igrave 'Ì'> <!ENTITY Iacute 'Í'> <!ENTITY Icirc 'Î'> <!ENTITY Iuml 'Ï'> <!ENTITY ETH 'Ð'> <!ENTITY Ntilde 'Ñ'> <!ENTITY Ograve 'Ò'> <!ENTITY Oacute 'Ó'> <!ENTITY Ocirc 'Ô'> <!ENTITY Otilde 'Õ'> <!ENTITY Ouml 'Ö'> <!ENTITY times '×'> <!ENTITY Oslash 'Ø'> <!ENTITY Ugrave 'Ù'> <!ENTITY Uacute 'Ú'> <!ENTITY Ucirc 'Û'> <!ENTITY Uuml 'Ü'> <!ENTITY Yacute 'Ý'> <!ENTITY THORN 'Þ'> <!ENTITY szlig 'ß'> <!ENTITY agrave 'à'> <!ENTITY aacute 'á'> <!ENTITY acirc 'â'> <!ENTITY atilde 'ã'> <!ENTITY auml 'ä'> <!ENTITY aring 'å'> <!ENTITY aelig 'æ'> <!ENTITY ccedil 'ç'> <!ENTITY egrave 'è'> <!ENTITY eacute 'é'> <!ENTITY ecirc 'ê'> <!ENTITY euml 'ë'> <!ENTITY igrave 'ì'> <!ENTITY iacute 'í'> <!ENTITY icirc 'î'> <!ENTITY iuml 'ï'> <!ENTITY eth 'ð'> <!ENTITY ntilde 'ñ'> <!ENTITY ograve 'ò'> <!ENTITY oacute 'ó'> <!ENTITY ocirc 'ô'> <!ENTITY otilde 'õ'> <!ENTITY ouml 'ö'> <!ENTITY divide '÷'> <!ENTITY oslash 'ø'> <!ENTITY ugrave 'ù'> <!ENTITY uacute 'ú'> <!ENTITY ucirc 'û'> <!ENTITY uuml 'ü'> <!ENTITY yacute 'ý'> <!ENTITY thorn 'þ'> <!ENTITY yuml 'ÿ'>" ;; let xhtml_special_ent = "<?xml version='1.0' encoding='UTF-8'?> <!ENTITY OElig 'Œ'> <!ENTITY oelig 'œ'> <!ENTITY Scaron 'Š'> <!ENTITY scaron 'š'> <!ENTITY Yuml 'Ÿ'> <!ENTITY circ 'ˆ'> <!ENTITY tilde '˜'> <!ENTITY ensp ' '> <!ENTITY emsp ' '> <!ENTITY thinsp ' '> <!ENTITY zwnj '‌'> <!ENTITY zwj '‍'> <!ENTITY lrm '‎'> <!ENTITY rlm '‏'> <!ENTITY ndash '–'> <!ENTITY mdash '—'> <!ENTITY lsquo '‘'> <!ENTITY rsquo '’'> <!ENTITY sbquo '‚'> <!ENTITY ldquo '“'> <!ENTITY rdquo '”'> <!ENTITY bdquo '„'> <!ENTITY dagger '†'> <!ENTITY Dagger '‡'> <!ENTITY permil '‰'> <!ENTITY lsaquo '‹'> <!ENTITY rsaquo '›'> <!ENTITY euro '€'>" ;; let xhtml_symbol_ent = "<?xml version='1.0' encoding='UTF-8'?> <!ENTITY fnof 'ƒ'> <!ENTITY Alpha 'Α'> <!ENTITY Beta 'Β'> <!ENTITY Gamma 'Γ'> <!ENTITY Delta 'Δ'> <!ENTITY Epsilon 'Ε'> <!ENTITY Zeta 'Ζ'> <!ENTITY Eta 'Η'> <!ENTITY Theta 'Θ'> <!ENTITY Iota 'Ι'> <!ENTITY Kappa 'Κ'> <!ENTITY Lambda 'Λ'> <!ENTITY Mu 'Μ'> <!ENTITY Nu 'Ν'> <!ENTITY Xi 'Ξ'> <!ENTITY Omicron 'Ο'> <!ENTITY Pi 'Π'> <!ENTITY Rho 'Ρ'> <!ENTITY Sigma 'Σ'> <!ENTITY Tau 'Τ'> <!ENTITY Upsilon 'Υ'> <!ENTITY Phi 'Φ'> <!ENTITY Chi 'Χ'> <!ENTITY Psi 'Ψ'> <!ENTITY Omega 'Ω'> <!ENTITY alpha 'α'> <!ENTITY beta 'β'> <!ENTITY gamma 'γ'> <!ENTITY delta 'δ'> <!ENTITY epsilon 'ε'> <!ENTITY zeta 'ζ'> <!ENTITY eta 'η'> <!ENTITY theta 'θ'> <!ENTITY iota 'ι'> <!ENTITY kappa 'κ'> <!ENTITY lambda 'λ'> <!ENTITY mu 'μ'> <!ENTITY nu 'ν'> <!ENTITY xi 'ξ'> <!ENTITY omicron 'ο'> <!ENTITY pi 'π'> <!ENTITY rho 'ρ'> <!ENTITY sigmaf 'ς'> <!ENTITY sigma 'σ'> <!ENTITY tau 'τ'> <!ENTITY upsilon 'υ'> <!ENTITY phi 'φ'> <!ENTITY chi 'χ'> <!ENTITY psi 'ψ'> <!ENTITY omega 'ω'> <!ENTITY thetasym 'ϑ'> <!ENTITY upsih 'ϒ'> <!ENTITY piv 'ϖ'> <!ENTITY bull '•'> <!ENTITY hellip '…'> <!ENTITY prime '′'> <!ENTITY Prime '″'> <!ENTITY oline '‾'> <!ENTITY frasl '⁄'> <!ENTITY weierp '℘'> <!ENTITY image 'ℑ'> <!ENTITY real 'ℜ'> <!ENTITY trade '™'> <!ENTITY alefsym 'ℵ'> <!ENTITY larr '←'> <!ENTITY uarr '↑'> <!ENTITY rarr '→'> <!ENTITY darr '↓'> <!ENTITY harr '↔'> <!ENTITY crarr '↵'> <!ENTITY lArr '⇐'> <!ENTITY uArr '⇑'> <!ENTITY rArr '⇒'> <!ENTITY dArr '⇓'> <!ENTITY hArr '⇔'> <!ENTITY forall '∀'> <!ENTITY part '∂'> <!ENTITY exist '∃'> <!ENTITY empty '∅'> <!ENTITY nabla '∇'> <!ENTITY isin '∈'> <!ENTITY notin '∉'> <!ENTITY ni '∋'> <!ENTITY prod '∏'> <!ENTITY sum '∑'> <!ENTITY minus '−'> <!ENTITY lowast '∗'> <!ENTITY radic '√'> <!ENTITY prop '∝'> <!ENTITY infin '∞'> <!ENTITY ang '∠'> <!ENTITY and '∧'> <!ENTITY or '∨'> <!ENTITY cap '∩'> <!ENTITY cup '∪'> <!ENTITY int '∫'> <!ENTITY there4 '∴'> <!ENTITY sim '∼'> <!ENTITY cong '≅'> <!ENTITY asymp '≈'> <!ENTITY ne '≠'> <!ENTITY equiv '≡'> <!ENTITY le '≤'> <!ENTITY ge '≥'> <!ENTITY sub '⊂'> <!ENTITY sup '⊃'> <!ENTITY nsub '⊄'> <!ENTITY sube '⊆'> <!ENTITY supe '⊇'> <!ENTITY oplus '⊕'> <!ENTITY otimes '⊗'> <!ENTITY perp '⊥'> <!ENTITY sdot '⋅'> <!ENTITY lceil '⌈'> <!ENTITY rceil '⌉'> <!ENTITY lfloor '⌊'> <!ENTITY rfloor '⌋'> <!ENTITY lang '〈'> <!ENTITY rang '〉'> <!ENTITY loz '◊'> <!ENTITY spades '♠'> <!ENTITY clubs '♣'> <!ENTITY hearts '♥'> <!ENTITY diams '♦'>" ;; let xhtml_doctype = "<?xml version='1.0' encoding='UTF-8'?> <!ENTITY % HTMLlat1 PUBLIC '-//W3C//ENTITIES Latin 1 for XHTML//EN' ''> <!ENTITY % HTMLspecial PUBLIC '-//W3C//ENTITIES Special for XHTML//EN' ''> <!ENTITY % HTMLsymbol PUBLIC '-//W3C//ENTITIES Symbols for XHTML//EN' ''> %HTMLlat1; %HTMLspecial; %HTMLsymbol;" ;; let parse filename = let namespace_manager = new Pxp_dtd.namespace_manager in namespace_manager # add_namespace "html" "http://www.w3.org/1999/xhtml"; (* Force that internally the namespace prefix "html" is used for html * namespace. PXP converts the namespace prefixes found in the input * document to "html". *) let catalog = Pxp_reader.lookup_public_id_as_string [ "-//W3C//ENTITIES Latin 1 for XHTML//EN", xhtml_lat1_ent; "-//W3C//ENTITIES Special for XHTML//EN", xhtml_special_ent; "-//W3C//ENTITIES Symbols for XHTML//EN", xhtml_symbol_ent; "-//W3C//DTD XHTML 1.0 Strict//EN", xhtml_doctype; "-//W3C//DTD XHTML 1.0 Transitional//EN", xhtml_doctype; "-//W3C//DTD XHTML 1.0 Frameset//EN", xhtml_doctype; ] in let reader = new Pxp_reader.combine [ new Pxp_reader.resolve_as_file(); catalog; ] in let config = { default_namespace_config with encoding = `Enc_utf8; enable_namespace_processing = Some namespace_manager; } in let filename_url = Pxp_reader.make_file_url filename in let filename_url_string = Neturl.string_of_url filename_url in parse_wfdocument_entity config (ExtID ((System filename_url_string), reader)) default_namespace_spec ;; let recognize_xhtml_doctype doc = match doc # dtd # id with Some(External(Public("-//W3C//DTD XHTML 1.0 Strict//EN",_))) -> `Strict | Some(Derived(Public("-//W3C//DTD XHTML 1.0 Strict//EN",_))) -> `Strict | Some(External(Public("-//W3C//DTD XHTML 1.0 Transitional//EN",_))) -> `Transitional | Some(Derived(Public("-//W3C//DTD XHTML 1.0 Transitional//EN",_))) -> `Transitional | Some(External(Public("-//W3C//DTD XHTML 1.0 Frameset//EN",_))) -> `Frameset | Some(Derived(Public("-//W3C//DTD XHTML 1.0 Frameset//EN",_))) -> `Frameset | _ -> prerr_endline("Warning: DOCTYPE not recognized, assuming HTML-4.01-transitional"); `Transitional (* Just a default... *) ;; let html401_doctype_string = [ `Frameset, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\">"; `Transitional, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">"; `Strict, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"; ] ;; let element_declared_as_empty name = try snd(List.assoc name Nethtml.html40_dtd) = `Empty with Not_found -> false ;; let convert_and_print ?(remove = "") (doc : 'ext document) = (* Only "html" prefix elements are printed, other namespaces are dropped. * *) let escape_html = let in_enc = `Enc_utf8 in let out_enc = `Enc_subset(`Enc_usascii, function 64 -> false (* Anti-SPAM *) | 58 -> false (* Anti-SPAM *) | _ -> true) in Netencoding.Html.encode ~in_enc ~out_enc ~prefer_name:false () in let print_data s = print_string (escape_html s) in let print_url s = let p = String.length remove in let l = String.length s in if p < l & String.sub s 0 p = remove then print_data(String.sub s p (l-p)) else print_data s in let rec print_attlist (node : 'ext node) = List.iter (fun (name,value) -> print_string name; print_string "=\""; ( match value with Value s -> if name="href" || name="src" then print_url s else print_data s | _ -> assert false (* Impossible in wf mode *) ); print_string "\" "; ) node#attributes in let rec print (node : 'ext node) = match node # node_type with T_element _ -> if node # normprefix = "html" then begin let htmlname = node # localname in let empty = element_declared_as_empty htmlname in print_string "<"; print_string htmlname; print_string " "; print_attlist node; print_string ">"; node # iter_nodes print; if not empty then begin print_string "</"; print_string htmlname; print_string ">"; end end | T_data -> print_data node#data | _ -> () (* Drop... *) in (* Detect DOCTYPE: *) let html_doctype = recognize_xhtml_doctype doc in (* Print DOCTYPE: *) let doctype_string = List.assoc html_doctype html401_doctype_string in print_endline doctype_string; (* Convert and print document: *) print doc#root; print_newline(); flush stdout ;; let main() = let file = ref "" in let remove = ref "" in Arg.parse [ "-remove", Arg.String (fun s -> remove := s), "<prefix> Remove this prefix from all URLs"; ] (fun s -> file := s) "Usage: xhtml2html file.xml >file.html"; if !file = "" then failwith "No input file! See xhtml2html -help for usage information."; let doc = parse !file in convert_and_print ~remove:!remove doc ;; try main() with error -> prerr_endline("*** xhtml2html stopped:"); prerr_endline(Pxp_types.string_of_exn error); exit 1 ;;