Skip to content

Commit

Permalink
Merge pull request #82 from edwintorok/private/edvint/format
Browse files Browse the repository at this point in the history
Reformat stdext
  • Loading branch information
robhoes committed Nov 29, 2023
2 parents 9443c20 + 93e2053 commit 6b8adab
Show file tree
Hide file tree
Showing 18 changed files with 1,135 additions and 840 deletions.
4 changes: 2 additions & 2 deletions lib/xapi-stdext-date/date.mli
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ val of_string : string -> t
val never : t
(** Same as {!epoch} *)

type iso8601 = t
(** Deprecated alias for {!t} *)
type iso8601 = t

type rfc822 = t
(** Deprecated alias for {!t} *)
type rfc822 = t
36 changes: 18 additions & 18 deletions lib/xapi-stdext-encodings/bench/bechamel_simple_cli.ml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ let benchmark tests =

let analyze raw_results =
let ols =
Analyze.ols ~r_square:true ~bootstrap:0 ~predictors:[|Measure.run|]
Analyze.ols ~r_square:true ~bootstrap:0 ~predictors:[|Measure.run|]
in
let results =
List.map (fun instance -> Analyze.all ols instance raw_results) instances in
List.map (fun instance -> Analyze.all ols instance raw_results) instances
in
(Analyze.merge ols instances results, raw_results)

let () =
Expand All @@ -26,26 +27,25 @@ let img (window, results) =
open Notty_unix

let cli tests =
Format.printf "@,Running benchmarks@.";
Format.printf "@,Running benchmarks@." ;
let results, _ = tests |> benchmark |> analyze in

(* compute speed from duration *)
let () =
Hashtbl.find results (Measure.label Instance.monotonic_clock)
|> Hashtbl.iter @@ fun name result ->
try
(* this relies on extracting input size from test name,
which works if Test.make_indexed* was used *)
Scanf.sscanf name "%_s@:%d" @@ fun length ->
match Analyze.OLS.estimates result with
| Some [duration] ->
(* unit is ns *)
let speed = 1e9 *. float length /. duration /. 1048576.0 in
Fmt.pf Fmt.stdout "@[%s = %.1f MiB/s@]@." name speed
| _ -> ()
with Failure _ | Scanf.Scan_failure _ -> ()
Hashtbl.find results (Measure.label Instance.monotonic_clock)
|> Hashtbl.iter @@ fun name result ->
try
(* this relies on extracting input size from test name,
which works if Test.make_indexed* was used *)
Scanf.sscanf name "%_s@:%d" @@ fun length ->
match Analyze.OLS.estimates result with
| Some [duration] ->
(* unit is ns *)
let speed = 1e9 *. float length /. duration /. 1048576.0 in
Fmt.pf Fmt.stdout "@[%s = %.1f MiB/s@]@." name speed
| _ ->
()
with Failure _ | Scanf.Scan_failure _ -> ()
in

let window =
match winsize Unix.stdout with
| Some (w, h) ->
Expand Down
17 changes: 7 additions & 10 deletions lib/xapi-stdext-encodings/bench/bench_encodings.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@ open Bechamel
open Xapi_stdext_encodings.Encodings

let test name f =
Test.make_indexed_with_resource ~name
~args:[10; 1000; 10000]
Test.multiple (* TODO: Test.uniq segfaults here, bechamel bug *)
~allocate:(fun i -> String.make i 'x')
~free:ignore
(fun (_:int) -> Staged.stage f)
Test.make_indexed_with_resource ~name ~args:[10; 1000; 10000]
Test.multiple (* TODO: Test.uniq segfaults here, bechamel bug *)
~allocate:(fun i -> String.make i 'x')
~free:ignore
(fun (_ : int) -> Staged.stage f)

let benchmarks =
Test.make_grouped ~name:"Encodings.validate"
[ test "UTF8_XML" UTF8_XML.validate
]
[test "UTF8_XML" UTF8_XML.validate]

let () =
Bechamel_simple_cli.cli benchmarks
let () = Bechamel_simple_cli.cli benchmarks
143 changes: 76 additions & 67 deletions lib/xapi-stdext-encodings/encodings.ml
Original file line number Diff line number Diff line change
Expand Up @@ -12,147 +12,156 @@
* GNU Lesser General Public License for more details.
*)
exception UCS_value_out_of_range

exception UCS_value_prohibited_in_UTF8

exception UCS_value_prohibited_in_XML

exception UTF8_character_incomplete

exception UTF8_header_byte_invalid

exception UTF8_continuation_byte_invalid

exception UTF8_encoding_not_canonical

exception String_incomplete

(* === Unicode Functions === *)

module UCS = struct

let is_non_character value = false
|| (0xfdd0 <= value && value <= 0xfdef) (* case 1 *)
|| (Int.logand 0xfffe value = 0xfffe) (* case 2 *)
[@@inline]

let is_non_character value =
false
|| (0xfdd0 <= value && value <= 0xfdef) (* case 1 *)
|| Int.logand 0xfffe value = 0xfffe
(* case 2 *)
[@@inline]
end

module XML = struct

let is_illegal_control_character value = let value = Uchar.to_int value in
value < 0x20
&& value <> 0x09
&& value <> 0x0a
&& value <> 0x0d
[@@inline]

let is_illegal_control_character value =
let value = Uchar.to_int value in
value < 0x20 && value <> 0x09 && value <> 0x0a && value <> 0x0d
[@@inline]
end

(* === UCS Validators === *)

module type UCS_VALIDATOR = sig

val validate : Uchar.t -> unit [@@inline]

end

module UTF8_UCS_validator = struct

let validate value =
if (UCS.is_non_character[@inlined]) (Uchar.to_int value) then raise UCS_value_prohibited_in_UTF8
if (UCS.is_non_character [@inlined]) (Uchar.to_int value) then
raise UCS_value_prohibited_in_UTF8
[@@inline]

end

module XML_UTF8_UCS_validator = struct

let validate value =
(UTF8_UCS_validator.validate[@inlined]) value;
if (XML.is_illegal_control_character[@inlined]) value
then raise UCS_value_prohibited_in_XML

(UTF8_UCS_validator.validate [@inlined]) value ;
if (XML.is_illegal_control_character [@inlined]) value then
raise UCS_value_prohibited_in_XML
end

(* === String Validators === *)

module type STRING_VALIDATOR = sig

val is_valid : string -> bool

val validate : string -> unit
val longest_valid_prefix : string -> string

val longest_valid_prefix : string -> string
end

exception Validation_error of int * exn

module UTF8_XML : STRING_VALIDATOR = struct

let decode_continuation_byte byte =
if byte land 0b11000000 = 0b10000000 then byte land 0b00111111 else
if byte land 0b11000000 = 0b10000000 then
byte land 0b00111111
else
raise UTF8_continuation_byte_invalid

let rec decode_continuation_bytes string last value index =
if index <= last then
let chunk = decode_continuation_byte (Char.code string.[index]) in
let value = (value lsl 6) lor chunk in
decode_continuation_bytes string last value (index + 1)
else value
else
value

let validate_character_utf8 string byte index =
let value, width =
if byte land 0b10000000 = 0b00000000 then (byte, 1) else
if byte land 0b11100000 = 0b11000000 then (byte land 0b0011111, 2) else
if byte land 0b11110000 = 0b11100000 then (byte land 0b0001111, 3) else
if byte land 0b11111000 = 0b11110000 then (byte land 0b0000111, 4) else
if byte land 0b10000000 = 0b00000000 then
(byte, 1)
else if byte land 0b11100000 = 0b11000000 then
(byte land 0b0011111, 2)
else if byte land 0b11110000 = 0b11100000 then
(byte land 0b0001111, 3)
else if byte land 0b11111000 = 0b11110000 then
(byte land 0b0000111, 4)
else
raise UTF8_header_byte_invalid
in
let value =
if width = 1 then value
else decode_continuation_bytes string (index+width-1) value (index+1)
if width = 1 then
value
else
decode_continuation_bytes string (index + width - 1) value (index + 1)
in
XML_UTF8_UCS_validator.validate (Uchar.unsafe_of_int value);
XML_UTF8_UCS_validator.validate (Uchar.unsafe_of_int value) ;
width

let rec validate_aux string length index =
if index = length then ()
if index = length then
()
else
let width =
try
let byte = string.[index] |> Char.code in
validate_character_utf8 string byte index
with
| Invalid_argument _ -> raise String_incomplete
| error -> raise (Validation_error(index, error))
in
validate_aux string length (index + width)

let validate string =
validate_aux string (String.length string) 0
let width =
try
let byte = string.[index] |> Char.code in
validate_character_utf8 string byte index
with
| Invalid_argument _ ->
raise String_incomplete
| error ->
raise (Validation_error (index, error))
in
validate_aux string length (index + width)

let validate string = validate_aux string (String.length string) 0

let rec validate_with_fastpath string stop pos =
if pos < stop then
(* the compiler is smart enough to optimize the 'int32' away here,
and not allocate *)
let i32 = String.get_int32_ne string pos |> Int32.to_int in
(* test that for all bytes 0x20 <= byte < 0x80.
(* the compiler is smart enough to optimize the 'int32' away here,
and not allocate *)
let i32 = String.get_int32_ne string pos |> Int32.to_int in
(* test that for all bytes 0x20 <= byte < 0x80.
If any is <0x20 it would cause a negative value to appear in that byte,
which we can detect if we use 0x80 as a mask.
Byte >= 0x80 can be similarly detected with a mask of 0x80 on each byte.
We don't want to see a 0x80 from either of these, hence we bitwise or the 2 values together.
*)
if (i32 lor (i32 - 0x20_20_20_20)) land 0x80_80_80_80 = 0 then
validate_with_fastpath string stop (pos + 4)
else (* when the condition doesn't hold fall back to full UTF8 decoder *)
validate_aux string (String.length string) pos
else
*)
if i32 lor (i32 - 0x20_20_20_20) land 0x80_80_80_80 = 0 then
validate_with_fastpath string stop (pos + 4)
else (* when the condition doesn't hold fall back to full UTF8 decoder *)
validate_aux string (String.length string) pos
else
validate_aux string (String.length string) pos

let validate_with_fastpath string =
validate_with_fastpath string (String.length string - 3) 0
validate_with_fastpath string (String.length string - 3) 0

let validate =
if Sys.word_size = 64 then validate_with_fastpath
else validate
if Sys.word_size = 64 then
validate_with_fastpath
else
validate

let is_valid string =
try validate string; true with _ -> false
let is_valid string = try validate string ; true with _ -> false

let longest_valid_prefix string =
try validate string; string
try validate string ; string
with Validation_error (index, _) -> String.sub string 0 index

end
18 changes: 11 additions & 7 deletions lib/xapi-stdext-encodings/encodings.mli
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,20 @@
(** {2 Exceptions} *)

exception UCS_value_out_of_range

exception UCS_value_prohibited_in_UTF8

exception UCS_value_prohibited_in_XML

exception UTF8_character_incomplete

exception UTF8_header_byte_invalid

exception UTF8_continuation_byte_invalid

exception UTF8_encoding_not_canonical
exception String_incomplete

exception String_incomplete

(** {2 UCS Validators} *)

Expand All @@ -38,27 +44,25 @@ end
module XML_UTF8_UCS_validator : UCS_VALIDATOR

module XML : sig
val is_illegal_control_character : Uchar.t -> bool
(** Returns true if and only if the given value corresponds to
* a illegal control character as defined in section 2.2 of
* the XML specification, version 1.0. *)
val is_illegal_control_character : Uchar.t -> bool
end

(** {2 String Validators} *)

(** Provides functionality for validating and processing
* strings according to a particular character encoding. *)
module type STRING_VALIDATOR = sig

(** Returns true if and only if the given string is validly-encoded. *)
val is_valid : string -> bool
(** Returns true if and only if the given string is validly-encoded. *)

val validate : string -> unit
(** Raises an encoding error if the given string is not validly-encoded. *)
val validate: string -> unit

(** Returns the longest validly-encoded prefix of the given string. *)
val longest_valid_prefix : string -> string

(** Returns the longest validly-encoded prefix of the given string. *)
end

(** Represents a validation error as a tuple [(i,e)], where:
Expand Down
Loading

0 comments on commit 6b8adab

Please sign in to comment.