(* $Id: ucs2_to_utf8.ml 665 2004-06-02 10:54:48Z gerd $ * ---------------------------------------------------------------------- * *) (******************************************************) (* Claudio Sacerdoti Coen <sacerdot@cs.unibo.it> *) (* 14/05/2000 *) (******************************************************) (* [14-Jun-2001] Slightly modified by Gerd Stolpmann *) (* [24-Aug-2002] Generalized for the code range 0-0x1fffff by gs *) (* Surrogate Pairs are not accepted in XML files (is it true???) *) exception SurrogatePairs;; (* Interval (n,m) where n >m m *) exception InvalidInterval of int * int;; (* Given an ucs2 character code, returns it in utf8 *) (* (as a concatenation of characters) *) let char_ucs2_to_utf8 = function n when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs | n when n <= 0x007F -> Uni_types.Char n | n when n <= 0x07FF -> Uni_types.Concat [[Uni_types.Char (n lsr 6 land 0b00011111 lor 0b11000000)] ; [Uni_types.Char (n land 0b00111111 lor 0b10000000)]] | n when n <= 0xffff -> Uni_types.Concat [[Uni_types.Char (n lsr 12 land 0b00001111 lor 0b11100000)] ; [Uni_types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ; [Uni_types.Char (n land 0b00111111 lor 0b10000000)]] | n when n <= 0x1fffff -> Uni_types.Concat [[Uni_types.Char (n lsr 18 land 0b00000111 lor 0b11110000)] ; [Uni_types.Char (n lsr 12 land 0b00111111 lor 0b10000000)] ; [Uni_types.Char (n lsr 6 land 0b00111111 lor 0b10000000)] ; [Uni_types.Char (n land 0b00111111 lor 0b10000000)]] | _ -> failwith "Code point is outside the supported range 0..0x1fffff" ;; (*CSC: Two functions for debugging pourposes only let char_ucs2_to_utf8 = function n when n >= 0xD800 && n <= 0xDFFF -> assert false | n when n <= 0x007F -> [[n]] | n when n <= 0x07FF -> [[(n lsr 6 land 0b00011111 lor 0b11000000)] ; [(n land 0b00111111 lor 0b10000000)]] | n -> [[(n lsr 12 land 0b00001111 lor 0b11100000)] ; [(n lsr 6 land 0b00111111 lor 0b10000000)] ; [(n land 0b00111111 lor 0b10000000)]] ;; let rec bprint = function 0 -> "" | n -> bprint (n / 2) ^ string_of_int (n mod 2) ;; *) (* A few useful functions *) let rec mklist e = function 0 -> [] | n -> e::(mklist e (n - 1)) ;; let sup = let t = Uni_types.Char 0b10111111 in function 1 -> t | n -> Uni_types.Concat (mklist [t] n) ;; let rec inf = let b = Uni_types.Char 0b10000000 in function 1 -> [[b]] | n -> mklist [b] n ;; let mysucc = function [Uni_types.Char n] -> n + 1 | _ -> assert false ;; let mypred = function [Uni_types.Char n] -> n - 1 | _ -> assert false ;; (* Given two utf8-encoded extremes of an interval character code *) (* whose 'length' is the same, it returns the utf8 regular expression *) (* matching all the characters in the interval *) let rec same_length_ucs2_to_utf8 = let module T = Uni_types in function (* Trivial cases: *) (T.Char n, T.Char m) when n = m -> [T.Char n] | (T.Char n, T.Char m) -> [T.Interval (n,m)] (* Anchors of the recursion for 2-element concatations: *) | (T.Concat [hen ; [tln]], T.Concat [hem ; [tlm]]) when hen = hem -> [T.Concat [hen ; same_length_ucs2_to_utf8 (tln,tlm)]] | (T.Concat [hen ; [tln]], T.Concat ([hem ; [tlm]] as e2)) -> (T.Concat [hen ; same_length_ucs2_to_utf8 (tln,sup 1)]) :: (let shen = mysucc hen and phem = mypred hem in let succhen = [T.Char shen] in if succhen = hem then same_length_ucs2_to_utf8 (T.Concat (succhen::(inf 1)), T.Concat e2) else (T.Concat [[T.Interval (shen, phem)] ; [T.Interval (0b10000000,0b10111111)]]):: same_length_ucs2_to_utf8 (T.Concat (hem::(inf 1)), T.Concat e2) ) (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf 1)), T.Concat e2)*) (* Reducing n-element concationations: *) | (T.Concat (hen::tln), T.Concat (hem::tlm)) when hen = hem -> [T.Concat [hen ; same_length_ucs2_to_utf8 (T.Concat tln, T.Concat tlm)]] | (T.Concat (hen::tln), T.Concat ((hem::tlm) as e2)) -> let n = List.length tln in (T.Concat [hen ; same_length_ucs2_to_utf8 (T.Concat tln,sup n)]) :: (let shen = mysucc hen and phem = mypred hem in let succhen = [T.Char shen] in if succhen = hem then same_length_ucs2_to_utf8 (T.Concat (succhen::(inf n)), T.Concat e2) else (T.Concat ([T.Interval (shen, phem)] :: mklist [T.Interval (0b10000000,0b10111111)] n) ):: same_length_ucs2_to_utf8 (T.Concat (hem::(inf n)), T.Concat e2) ) (*same_length_ucs2_to_utf8 (T.Concat ((mysucc hen)::(inf n)),T.Concat e2)*) | _ -> assert false ;; (* Given an interval of ucs2 characters, splits *) (* the list in subintervals whose extremes has *) (* the same utf8 encoding length and, for each *) (* extreme, calls same_length_ucs2_to_utf8 *) let rec seq_ucs2_to_utf8 = function (n,_) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs | (_,n) when n >= 0xD800 && n <= 0xDFFF -> raise SurrogatePairs | (n,m) when n > m -> raise (InvalidInterval (n,m)) | (n,m) when n = m -> [char_ucs2_to_utf8 n] | (n,m) when n <= 0x07F && m > 0x07F -> (seq_ucs2_to_utf8 (n,0x07F)) @ (seq_ucs2_to_utf8 (0x080,m)) | (n,m) when n <= 0x07FF && m > 0x07FF -> (seq_ucs2_to_utf8 (n,0x07FF)) @ (seq_ucs2_to_utf8 (0x0800,m)) | (n,m) when n <= 0xffff && m > 0xffff -> (seq_ucs2_to_utf8 (n,0xFFFF)) @ (seq_ucs2_to_utf8 (0x10000,m)) | (n,m) -> let utf8n = char_ucs2_to_utf8 n and utf8m = char_ucs2_to_utf8 m in same_length_ucs2_to_utf8 (utf8n,utf8m) ;; (* simplify: For example, '\224'('\160'['\128'-'\191'] | ['\161'-'\190']['\128'-'\191'] | '\191'['\128'-'\191']) | can be simplified to '\224' ['\160'-'\191'] ['\128'-'\191'] *) let rec simplify_disjunction = function Uni_types.Char n1 :: Uni_types.Interval(n2,n3) :: rest when n1+1 = n2 -> simplify_disjunction(Uni_types.Interval(n2,n3) :: rest) | Uni_types.Interval(n1,n2) :: Uni_types.Interval(n3,n4) :: rest when n2+1 = n3 -> simplify_disjunction(Uni_types.Interval(n1,n4) :: rest) | Uni_types.Interval(n1,n2) :: Uni_types.Char n3 :: rest when n2+1 = n3 -> simplify_disjunction(Uni_types.Interval(n1,n3) :: rest) | Uni_types.Concat( [Uni_types.Char n1] :: tail1 ) :: Uni_types.Concat( [Uni_types.Interval(n2,n3)] :: tail2 ) :: rest when n1+1 = n2 && tail1 = tail2 -> simplify_disjunction( Uni_types.Concat( [Uni_types.Interval(n1,n3)] :: tail1 ) :: rest) | Uni_types.Concat( [Uni_types.Interval(n1,n2)] :: tail1 ) :: Uni_types.Concat( [Uni_types.Interval(n3,n4)] :: tail2 ) :: rest when n2+1 = n3 && tail1 = tail2 -> simplify_disjunction( Uni_types.Concat( [Uni_types.Interval(n1,n4)] :: tail1 ) :: rest) | Uni_types.Concat( [Uni_types.Interval(n1,n2)] :: tail1 ) :: Uni_types.Concat( [Uni_types.Char n3] :: tail2 ) :: rest when n2+1 = n3 && tail1 = tail2 -> simplify_disjunction( Uni_types.Concat( [Uni_types.Interval(n1,n3)] :: tail1 ) :: rest) | Uni_types.Concat([[x]]) :: rest -> simplify_disjunction(x :: rest) | Uni_types.Concat([Uni_types.Concat d] :: d') :: rest -> let d'' = List.map simplify_disjunction d' in simplify_disjunction(Uni_types.Concat(d @ d'') :: rest) (* there are probably missing cases!!! *) | Uni_types.Concat l :: rest -> let l' = List.map simplify_disjunction l in if l = l' then Uni_types.Concat l :: simplify_disjunction rest else simplify_disjunction(Uni_types.Concat l' :: simplify_disjunction rest) | x :: rest -> x :: (simplify_disjunction rest) | [] -> [] ;; let rec multi_simplify_disjunction l = let l' = simplify_disjunction l in if l = l' then l else multi_simplify_disjunction l' ;; (* Given an ucs2 regual expression, returns *) (* the corresponding utf8 regular expression *) let ucs2_to_utf8 { Uni_types.id = id ; Uni_types.rel = rel } = let rec aux re l2 = match re with Uni_types.Char i -> char_ucs2_to_utf8 i :: l2 | Uni_types.Interval (l,u) -> seq_ucs2_to_utf8 (l,u) @ l2 | Uni_types.Identifier _ as i -> i :: l2 | Uni_types.Concat rell -> let foo rel = List.fold_right aux rel [] in Uni_types.Concat (List.map foo rell) :: l2 in { Uni_types.id = id ; Uni_types.rel = multi_simplify_disjunction (List.fold_right aux rel []) } ;;