diff --git a/src/Collections/Sequences/Seq.dfy b/src/Collections/Sequences/Seq.dfy index 815970b9..0a04b969 100644 --- a/src/Collections/Sequences/Seq.dfy +++ b/src/Collections/Sequences/Seq.dfy @@ -90,6 +90,26 @@ module {:options "-functionSyntax:4"} Seq { * ***********************************************************/ + /* If a predicate is true at every index of a sequence, + it is true for every member of the sequence as a collection. + Useful for converting quantifiers between the two forms + to satisfy a precondition in the latter form. */ + lemma IndexingImpliesMembership(p: T -> bool, xs: seq) + requires forall i | 0 <= i < |xs| :: p(xs[i]) + ensures forall t | t in xs :: p(t) + { + } + + /* If a predicate is true for every member of a sequence as a collection, + it is true at every index of the sequence. + Useful for converting quantifiers between the two forms + to satisfy a precondition in the latter form. */ + lemma MembershipImpliesIndexing(p: T -> bool, xs: seq) + requires forall t | t in xs :: p(t) + ensures forall i | 0 <= i < |xs| :: p(xs[i]) + { + } + /* Is true if the sequence xs is a prefix of the sequence ys. */ ghost predicate IsPrefix(xs: seq, ys: seq) ensures IsPrefix(xs, ys) ==> (|xs| <= |ys| && xs == ys[..|xs|]) @@ -603,8 +623,12 @@ module {:options "-functionSyntax:4"} Seq { ensures forall i {:trigger result[i]} :: 0 <= i < |xs| ==> result[i] == f(xs[i]); reads set i, o | 0 <= i < |xs| && o in f.reads(xs[i]) :: o { - if |xs| == 0 then [] - else [f(xs[0])] + Map(f, xs[1..]) + // This uses a sequence comprehension because it will usually be + // more efficient when compiled, allocating the storage for |xs| elements + // once instead of creating a chain of |xs| single element concatenations. + seq(|xs|, i requires 0 <= i < |xs| && f.requires(xs[i]) + reads set i,o | 0 <= i < |xs| && o in f.reads(xs[i]) :: o + => f(xs[i])) } /* Applies a function to every element of a sequence, returning a Result value (which is a @@ -803,6 +827,26 @@ module {:options "-functionSyntax:4"} Seq { } } + /* Optimized implementation of Flatten(Map(f, xs)). */ + function {:opaque} FlatMap(f: (T ~> seq), xs: seq): (result: seq) + requires forall i :: 0 <= i < |xs| ==> f.requires(xs[i]) + ensures result == Flatten(Map(f, xs)); + reads set i, o | 0 <= i < |xs| && o in f.reads(xs[i]) :: o + { + Flatten(Map(f, xs)) + } + by method { + result := []; + ghost var unflattened: seq> := []; + for i := |xs| downto 0 + invariant unflattened == Map(f, xs[i..]) + invariant result == Flatten(unflattened) + { + var next := f(xs[i]); + unflattened := [next] + unflattened; + result := next + result; + } + } /********************************************************** * diff --git a/src/JSON/Deserializer.dfy b/src/JSON/Deserializer.dfy index 07b359a6..32155a4b 100644 --- a/src/JSON/Deserializer.dfy +++ b/src/JSON/Deserializer.dfy @@ -11,7 +11,6 @@ include "../BoundedInts.dfy" include "Utils/Views.dfy" include "Utils/Vectors.dfy" -include "Utils/Unicode.dfy" include "Errors.dfy" include "AST.dfy" include "Grammar.dfy" @@ -25,7 +24,7 @@ module {:options "-functionSyntax:4"} JSON.Deserializer { import opened Logarithm import opened Power import opened Utils.Str - import Utils.Unicode + import opened UnicodeStrings import AST import Spec @@ -39,60 +38,81 @@ module {:options "-functionSyntax:4"} JSON.Deserializer { js.At(0) == 't' as byte } + function UnsupportedEscape16(code: seq): DeserializationError { + UnsupportedEscape(FromUTF16Checked(code).UnwrapOr("Couldn't decode UTF-16")) + } + + module Uint16StrConversion refines Str.ParametricConversion { + import opened BoundedInts + + type Char = uint16 + } + + const HEX_TABLE_16: map := + map[ + '0' as uint16 := 0, '1' as uint16 := 1, '2' as uint16 := 2, '3' as uint16 := 3, '4' as uint16 := 4, + '5' as uint16 := 5, '6' as uint16 := 6, '7' as uint16 := 7, '8' as uint16 := 8, '9' as uint16 := 9, + 'a' as uint16 := 0xA, 'b' as uint16 := 0xB, 'c' as uint16 := 0xC, 'd' as uint16 := 0xD, 'e' as uint16 := 0xE, 'f' as uint16 := 0xF, + 'A' as uint16 := 0xA, 'B' as uint16 := 0xB, 'C' as uint16 := 0xC, 'D' as uint16 := 0xD, 'E' as uint16 := 0xE, 'F' as uint16 := 0xF + ] + + function ToNat16(str: Uint16StrConversion.String): uint16 + requires |str| <= 4 + requires forall c | c in str :: c in HEX_TABLE_16 + { + Uint16StrConversion.ToNat_bound(str, 16, HEX_TABLE_16); + var hd := Uint16StrConversion.ToNat_any(str, 16, HEX_TABLE_16); + assert hd < 0x1_0000 by { reveal Pow(); } + hd as uint16 + } + // TODO: Verify this function - function Unescape(str: string, start: nat := 0): DeserializationResult + function Unescape(str: seq, start: nat := 0): DeserializationResult> decreases |str| - start { // Assumes UTF-16 strings if start >= |str| then Success([]) - else if str[start] == '\\' then + else if str[start] == '\\' as uint16 then if |str| == start + 1 then Failure(EscapeAtEOS) else var c := str[start + 1]; - if c == 'u' then + if c == 'u' as uint16 then if |str| <= start + 6 then Failure(EscapeAtEOS) else var code := str[start + 2..start + 6]; - if exists c | c in code :: c !in Str.HEX_TABLE then - Failure(UnsupportedEscape(code)) + if exists c | c in code :: c !in HEX_TABLE_16 then + Failure(UnsupportedEscape16(code)) else var tl :- Unescape(str, start + 6); - var hd := Str.ToNat(code, 16); - assert hd < 0x10000 by { reveal Pow(); } - if 0xD7FF < hd then - Failure(UnsupportedEscape(code)) - else - Success([hd as char]) - + var hd := ToNat16(code); + Success([hd]) else var unescaped: uint16 := match c - case '\"' => 0x22 as uint16 // quotation mark - case '\\' => 0x5C as uint16 // reverse solidus - case 'b' => 0x08 as uint16 // backspace - case 'f' => 0x0C as uint16 // form feed - case 'n' => 0x0A as uint16 // line feed - case 'r' => 0x0D as uint16 // carriage return - case 't' => 0x09 as uint16 // tab + case 0x22 => 0x22 as uint16 // \" => quotation mark + case 0x5C => 0x5C as uint16 // \\ => reverse solidus + case 0x62 => 0x08 as uint16 // \b => backspace + case 0x66 => 0x0C as uint16 // \f => form feed + case 0x6E => 0x0A as uint16 // \n => line feed + case 0x72 => 0x0D as uint16 // \r => carriage return + case 0x74 => 0x09 as uint16 // \t => tab case _ => 0 as uint16; - if unescaped == 0 as uint16 then - Failure(UnsupportedEscape(str[start..start+2])) + if unescaped as int == 0 then + Failure(UnsupportedEscape16(str[start..start+2])) else var tl :- Unescape(str, start + 2); - Success([unescaped as char] + tl) + Success([unescaped] + tl) else var tl :- Unescape(str, start + 1); Success([str[start]] + tl) } - function Transcode8To16Unescaped(str: seq): DeserializationResult - // TODO Optimize with a function by method - { - Unescape(Unicode.Transcode8To16(str)) - } - function String(js: Grammar.jstring): DeserializationResult { - Transcode8To16Unescaped(js.contents.Bytes()) + // TODO Optimize with a function by method + var asUtf32 :- FromUTF8Checked(js.contents.Bytes()).ToResult'(DeserializationError.InvalidUnicode); + var asUint16 :- ToUTF16Checked(asUtf32).ToResult'(DeserializationError.InvalidUnicode); + var unescaped :- Unescape(asUint16); + FromUTF16Checked(unescaped).ToResult'(DeserializationError.InvalidUnicode) } module ByteStrConversion refines Str.ParametricConversion { diff --git a/src/JSON/Errors.dfy b/src/JSON/Errors.dfy index d8aae9db..891cffc3 100644 --- a/src/JSON/Errors.dfy +++ b/src/JSON/Errors.dfy @@ -19,6 +19,7 @@ module {:options "-functionSyntax:4"} JSON.Errors { | ReachedEOF | ExpectingByte(expected: byte, b: opt_byte) | ExpectingAnyByte(expected_sq: seq, b: opt_byte) + | InvalidUnicode { function ToString() : string { match this @@ -36,6 +37,7 @@ module {:options "-functionSyntax:4"} JSON.Errors { var c := if b > 0 then "'" + [b as char] + "'" else "EOF"; var c0s := seq(|bs0|, idx requires 0 <= idx < |bs0| => bs0[idx] as char); "Expecting one of '" + c0s + "', read " + c + case InvalidUnicode => "Invalid Unicode sequence" } } @@ -43,12 +45,14 @@ module {:options "-functionSyntax:4"} JSON.Errors { | OutOfMemory | IntTooLarge(i: int) | StringTooLong(s: string) + | InvalidUnicode { function ToString() : string { match this case OutOfMemory => "Out of memory" case IntTooLarge(i: int) => "Integer too large: " + Str.OfInt(i) case StringTooLong(s: string) => "String too long: " + s + case InvalidUnicode => "Invalid Unicode sequence" } } diff --git a/src/JSON/Serializer.dfy b/src/JSON/Serializer.dfy index b0d65933..c1f364ec 100644 --- a/src/JSON/Serializer.dfy +++ b/src/JSON/Serializer.dfy @@ -12,7 +12,6 @@ include "../Math.dfy" include "Utils/Views.dfy" include "Utils/Vectors.dfy" -include "Utils/Unicode.dfy" include "Errors.dfy" include "AST.dfy" include "Grammar.dfy" @@ -24,7 +23,6 @@ module {:options "-functionSyntax:4"} JSON.Serializer { import opened Wrappers import opened BoundedInts import opened Utils.Str - import Utils.Unicode import AST import Spec @@ -43,37 +41,12 @@ module {:options "-functionSyntax:4"} JSON.Serializer { View.OfBytes(if b then TRUE else FALSE) } - function Transcode16To8Escaped(str: string, start: uint32 := 0): bytes { - Unicode.Transcode16To8(Spec.Escape(str)) - } // FIXME speed up using a `by method` - // by method { - // var len := |str| as uint32; - // if len == 0 { - // return []; - // } - // var st := new Vectors.Vector(0, len); - // var c0: uint16 := 0; - // var c1: uint16 := str[0] as uint16; - // var idx: uint32 := 0; - // while idx < len { - // var c0 := c1; - // var c1 := str[idx + 1] as uint16; - // if c0 < 0xD800 || c0 > 0xDBFF { - // Utf8Encode(st, Unicode.Utf16Decode1(c0)); - // idx := idx +1; - // } else { - // Utf8Encode(st, Unicode.Utf16Decode2(c0, c1)); - // idx := idx + 2; - // } - // } - // } - function CheckLength(s: seq, err: SerializationError): Outcome { Need(|s| < TWO_TO_THE_32, err) } function String(str: string): Result { - var bs := Transcode16To8Escaped(str); + var bs :- Spec.EscapeToUTF8(str); :- CheckLength(bs, StringTooLong(str)); Success(Grammar.JString(Grammar.DOUBLEQUOTE, View.OfBytes(bs), Grammar.DOUBLEQUOTE)) } diff --git a/src/JSON/Spec.dfy b/src/JSON/Spec.dfy index 33cd3ff8..f4b06087 100644 --- a/src/JSON/Spec.dfy +++ b/src/JSON/Spec.dfy @@ -9,9 +9,12 @@ include "../BoundedInts.dfy" include "../NonlinearArithmetic/Logarithm.dfy" +include "../Collections/Sequences/Seq.dfy" +// TODO: Remove and follow one of the options documented in UnicodeStrings.dfy +include "../Unicode/UnicodeStringsWithoutUnicodeChar.dfy" include "AST.dfy" -include "Utils/Unicode.dfy" +include "Errors.dfy" include "Utils/Str.dfy" module {:options "-functionSyntax:4"} JSON.Spec { @@ -19,13 +22,20 @@ module {:options "-functionSyntax:4"} JSON.Spec { import opened Utils.Str import opened AST - import opened Utils.Unicode + import opened Wrappers + import opened Errors + import opened UnicodeStrings import opened Logarithm + import Seq + type bytes = seq + type Result<+T> = SerializationResult - function EscapeUnicode(c: uint16): string { - var s := Str.OfNat(c as nat, 16); + function EscapeUnicode(c: uint16): seq { + var sStr := Str.OfNat(c as nat, 16); + Seq.MembershipImpliesIndexing(c => 0 <= c as int < 128, sStr); + var s := ASCIIToUTF16(sStr); assert |s| <= 4 by { assert c as nat <= 0xFFFF; assert Log(16, c as nat) <= Log(16, 0xFFFF) by { @@ -33,71 +43,101 @@ module {:options "-functionSyntax:4"} JSON.Spec { } assert Log(16, 0xFFFF) == 3 by { reveal Log(); } } - s + seq(4 - |s|, _ => ' ') + s + seq(4 - |s|, _ => ' ' as uint16) } - function Escape(str: string, start: nat := 0): string + function Escape(str: seq, start: nat := 0): seq decreases |str| - start { if start >= |str| then [] else - (match str[start] as uint16 - case 0x22 => "\\\"" // quotation mark - case 0x5C => "\\\\" // reverse solidus - case 0x08 => "\\b" // backspace - case 0x0C => "\\f" // form feed - case 0x0A => "\\n" // line feed - case 0x0D => "\\r" // carriage return - case 0x09 => "\\t" // tab + (match str[start] + case 0x22 => ASCIIToUTF16("\\\"") // quotation mark + case 0x5C => ASCIIToUTF16("\\\\") // reverse solidus + case 0x08 => ASCIIToUTF16("\\b") // backspace + case 0x0C => ASCIIToUTF16("\\f") // form feed + case 0x0A => ASCIIToUTF16("\\n") // line feed + case 0x0D => ASCIIToUTF16("\\r") // carriage return + case 0x09 => ASCIIToUTF16("\\t") // tab case c => - if c < 0x001F then "\\u" + EscapeUnicode(c) - else [str[start]]) - + Escape(str, start + 1) + if c < 0x001F then ASCIIToUTF16("\\u") + EscapeUnicode(c) + else [str[start]]) + + Escape(str, start + 1) + } + + function EscapeToUTF8(str: string, start: nat := 0): Result { + var utf16 :- ToUTF16Checked(str).ToResult'(SerializationError.InvalidUnicode); + var escaped := Escape(utf16); + var utf32 :- FromUTF16Checked(escaped).ToResult'(SerializationError.InvalidUnicode); + ToUTF8Checked(utf32).ToResult'(SerializationError.InvalidUnicode) } - function ToBytes(s: string) : seq - requires forall c: char | c in s :: c as int < 256 + // Can fail due to invalid UTF-16 sequences in a string when --unicode-char is off + function String(str: string): Result { + var inBytes :- EscapeToUTF8(str); + Success(ASCIIToUTF8("\"") + inBytes + ASCIIToUTF8("\"")) + } + + lemma OfIntOnlyASCII(n: int) + ensures + && var s := Str.OfInt(n); + && forall i | 0 <= i < |s| :: 0 <= s[i] as int < 128 { - seq(|s|, i requires 0 <= i < |s| => - assert s[i] in s; s[i] as byte) + var s := Str.OfInt(n); + forall i | 0 <= i < |s| ensures 0 <= s[i] as int < 128 { + if i == 0 { + } else { + var isHexDigit := c => c in HEX_DIGITS; + assert CharStrConversion.NumberStr(s, '-', isHexDigit); + assert isHexDigit(s[i]); + } + } } - function String(str: string): bytes { - ToBytes("\"") + Transcode16To8(Escape(str)) + ToBytes("\"") + function IntToBytes(n: int): bytes { + var s := Str.OfInt(n); + OfIntOnlyASCII(n); + ASCIIToUTF8(s) } - function Number(dec: Decimal): bytes { - Transcode16To8(Str.OfInt(dec.n)) + + function Number(dec: Decimal): Result { + Success(IntToBytes(dec.n) + (if dec.e10 == 0 then [] - else ToBytes("e") + Transcode16To8(Str.OfInt(dec.e10))) + else ASCIIToUTF8("e") + IntToBytes(dec.e10))) } - function KeyValue(kv: (string, JSON)): bytes { - String(kv.0) + ToBytes(":") + JSON(kv.1) + function KeyValue(kv: (string, JSON)): Result { + var key :- String(kv.0); + var value :- JSON(kv.1); + Success(key + ASCIIToUTF8(":") + value) } - function Join(sep: bytes, items: seq): bytes { - if |items| == 0 then [] - else if |items| == 1 then items[0] - else items[0] + sep + Join(sep, items[1..]) + function Join(sep: bytes, items: seq>): Result { + if |items| == 0 then + Success([]) + else + var first :- items[0]; + if |items| == 1 then + Success(first) + else + var rest :- Join(sep, items[1..]); + Success(first + sep + rest) } - function Object(obj: seq<(string, JSON)>): bytes { - ToBytes("{") + - Join(ToBytes(","), seq(|obj|, i requires 0 <= i < |obj| => KeyValue(obj[i]))) + - ToBytes("}") + function Object(obj: seq<(string, JSON)>): Result { + var middle :- Join(ASCIIToUTF8(","), seq(|obj|, i requires 0 <= i < |obj| => KeyValue(obj[i]))); + Success(ASCIIToUTF8("{") + middle + ASCIIToUTF8("}")) } - function Array(arr: seq): bytes { - ToBytes("[") + - Join(ToBytes(","), seq(|arr|, i requires 0 <= i < |arr| => JSON(arr[i]))) + - ToBytes("]") + function Array(arr: seq): Result { + var middle :- Join(ASCIIToUTF8(","), seq(|arr|, i requires 0 <= i < |arr| => JSON(arr[i]))); + Success(ASCIIToUTF8("[") + middle + ASCIIToUTF8("]")) } - function JSON(js: JSON): bytes { + function JSON(js: JSON): Result { match js - case Null => ToBytes("null") - case Bool(b) => if b then ToBytes("true") else ToBytes("false") + case Null => Success(ASCIIToUTF8("null")) + case Bool(b) => Success(if b then ASCIIToUTF8("true") else ASCIIToUTF8("false")) case String(str) => String(str) case Number(dec) => Number(dec) case Object(obj) => Object(obj) diff --git a/src/JSON/Tests.dfy b/src/JSON/Tests.dfy index 255cfd9c..32044edc 100644 --- a/src/JSON/Tests.dfy +++ b/src/JSON/Tests.dfy @@ -1,6 +1,5 @@ // RUN: %run "%s" -include "Utils/Unicode.dfy" include "Errors.dfy" include "API.dfy" include "ZeroCopy/API.dfy" @@ -8,9 +7,8 @@ include "../Collections/Sequences/Seq.dfy" abstract module JSON.Tests.Wrapper { import Utils.Str - import Utils.Unicode import opened BoundedInts - + import opened UnicodeStrings import opened Errors type JSON @@ -23,15 +21,15 @@ abstract module JSON.Tests.Wrapper { var js :- expect Deserialize(bs); // print indent, "=> ", js, "\n"; var bs' :- expect Serialize(js); - print indent, "=> " + Unicode.Transcode8To16(bs') + "\n"; + print indent, "=> ", FromUTF8Checked(bs'), "\n"; var sbs' :- expect SpecSerialize(js); - print indent, "=> " + Unicode.Transcode8To16(sbs') + "\n"; + print indent, "=> ", FromUTF8Checked(sbs'), "\n"; var js' :- expect Deserialize(bs'); Check(bs, js, bs', sbs', js'); } method TestString(str: string, indent: string) { - var bs := Unicode.Transcode16To8(str); + var bs :- expect ToUTF8Checked(str); TestBytestring(bs, indent); } @@ -92,7 +90,7 @@ module JSON.Tests.AbstractSyntaxWrapper refines Wrapper { } method SpecSerialize(js: JSON) returns (bs: SerializationResult) { - bs := Success(Spec.JSON(js)); + bs := Spec.JSON(js); } method Check(bs: bytes, js: JSON, bs': bytes, sbs': bytes, js': JSON) { diff --git a/src/JSON/Tutorial.dfy b/src/JSON/Tutorial.dfy index a87384ce..e2ea3e97 100644 --- a/src/JSON/Tutorial.dfy +++ b/src/JSON/Tutorial.dfy @@ -12,9 +12,9 @@ include "ZeroCopy/API.dfy" module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { import API - import Utils.Unicode import opened AST import opened Wrappers + import opened UnicodeStrings /// The high-level API works with fairly simple ASTs that contain native Dafny /// strings: @@ -28,14 +28,14 @@ module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { /// have syntax for byte strings; in a real application, we would be reading and /// writing raw bytes directly from disk or from the network instead). - var SIMPLE_JS := Unicode.Transcode16To8("[true]"); + var SIMPLE_JS :- expect ToUTF8Checked("[true]"); var SIMPLE_AST := Array([Bool(true)]); expect API.Deserialize(SIMPLE_JS) == Success(SIMPLE_AST); /// Here is a larger object, written using a verbatim string (with `@"`). In /// verbatim strings `""` represents a single double-quote character): - var CITIES_JS := Unicode.Transcode16To8(@"{ + var CITIES_JS :- expect ToUTF8Checked(@"{ ""Cities"": [ { ""Name"": ""Boston"", @@ -87,7 +87,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { /// For more complex object, the generated layout may not be exactly the same; note in particular how the representation of numbers and the whitespace have changed. - var EXPECTED := Unicode.Transcode16To8( + var EXPECTED :- expect ToUTF8Checked( @"{""Cities"":[{""Name"":""Boston"",""Founded"":1630,""Population"":689386,""Area (km2)"":45842e-1},{""Name"":""Rome"",""Founded"":-753,""Population"":2873e3,""Area (km2)"":1285},{""Name"":""Paris"",""Founded"":null,""Population"":2161e3,""Area (km2)"":23835e-1}]}" ); @@ -97,7 +97,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { /// existing buffer or into an array. Below is the smaller example from the /// README, as a sanity check: - var CITY_JS := Unicode.Transcode16To8(@"{""Cities"": [{ + var CITY_JS :- expect ToUTF8Checked(@"{""Cities"": [{ ""Name"": ""Boston"", ""Founded"": 1630, ""Population"": 689386, @@ -113,7 +113,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { expect API.Deserialize(CITY_JS) == Success(CITY_AST); - var EXPECTED' := Unicode.Transcode16To8( + var EXPECTED' :- expect ToUTF8Checked( @"{""Cities"":[{""Name"":""Boston"",""Founded"":1630,""Population"":689386,""Area (km2)"":45842e-1}]}" ); @@ -128,7 +128,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.AbstractSyntax { module {:options "-functionSyntax:4"} JSON.Examples.ConcreteSyntax { import ZeroCopy.API - import Utils.Unicode + import opened UnicodeStrings import opened Grammar import opened Wrappers @@ -145,7 +145,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.ConcreteSyntax { /// all formatting information, re-serializing an object produces the original /// value: - var CITIES := Unicode.Transcode16To8(@"{ + var CITIES :- expect ToUTF8Checked(@"{ ""Cities"": [ { ""Name"": ""Boston"", @@ -176,8 +176,8 @@ module {:options "-functionSyntax:4"} JSON.Examples.ConcreteSyntax { /// First, we construct a JSON value for the string `"Unknown"`; this could be /// done by hand using `View.OfBytes()`, but using `API.Deserialize` is even /// simpler: - - var UNKNOWN :- expect API.Deserialize(Unicode.Transcode16To8(@"""Unknown""")); + var UNKNOWN_JS :- expect ToUTF8Checked(@"""Unknown"""); + var UNKNOWN :- expect API.Deserialize(UNKNOWN_JS); /// `UNKNOWN` is of type `Grammar.JSON`, which contains optional whitespace and /// a `Grammar.Value` under the name `UNKNOWN.t`, which we can use in the @@ -188,7 +188,7 @@ module {:options "-functionSyntax:4"} JSON.Examples.ConcreteSyntax { /// Then, if we reserialize, we see that all formatting (and, in fact, all of /// the serialization work) has been reused: - expect API.Serialize(without_null) == Success(Unicode.Transcode16To8(@"{ + var expected_js :- expect ToUTF8Checked(@"{ ""Cities"": [ { ""Name"": ""Boston"", @@ -207,7 +207,9 @@ module {:options "-functionSyntax:4"} JSON.Examples.ConcreteSyntax { ""Area (km2)"": 2383.5 } ] - }")); + }"); + var actual_js :- expect API.Serialize(without_null); + expect actual_js == expected_js; } /// All that remains is to write the recursive traversal: diff --git a/src/JSON/Utils/Unicode.dfy b/src/JSON/Utils/Unicode.dfy deleted file mode 100644 index 872d85e3..00000000 --- a/src/JSON/Utils/Unicode.dfy +++ /dev/null @@ -1,160 +0,0 @@ -// RUN: %verify "%s" - -include "../../BoundedInts.dfy" - -// TODO: This module was written before Dafny got a Unicode library. It would -// be better to combine the two, especially given that the Unicode library has -// proofs! - -module {:options "-functionSyntax:4"} JSON.Utils.Unicode { - import opened BoundedInts - - function Utf16Decode1(c: uint16): uint32 - requires c < 0xD800 || 0xDBFF < c - { - c as uint32 - } - - function Utf16Decode2(c0: uint16, c1: uint16): uint32 - requires 0xD800 <= c0 as int <= 0xDBFF - { - (0x10000 - + ((((c0 as bv32) & 0x03FF) << 10) - | ((c1 as bv32) & 0x03FF))) - as uint32 - } - - newtype opt_uint16 = c: int | -1 <= c as int < TWO_TO_THE_16 - - function Utf16DecodeChars(c0: uint16, c1: opt_uint16): (r: (uint32, uint8)) - ensures r.1 in {1, 2} - ensures c1 == -1 ==> r.1 == 1 - { - if c0 < 0xD800 || 0xDBFF < c0 then - (Utf16Decode1(c0), 1) - else if c1 >= 0 then - (Utf16Decode2(c0, c1 as uint16), 2) - else - (0xFFFD, 1) // Replacement character - } - - function Utf16Decode(str: string, start: nat := 0): seq - decreases |str| - start - { - if start >= |str| then [] - else - var c0 := str[start] as uint16; - var c1 := if |str| > start + 1 then str[start + 1] as opt_uint16 else -1; - var (cp, delta) := Utf16DecodeChars(c0, c1); - [cp] + Utf16Decode(str, start + delta as nat) - } - - function Utf16Encode2(cp: uint32): seq - requires cp < 0x100000 - { - var bv := cp as bv20; - [(0xD800 | (bv >> 10)) as char, - (0xDC00 | (bv & 0x03FF)) as char] - } - - function Utf16Encode1(cp: uint32): seq { - if cp < 0xD800 || 0xDBFF < cp < 0x10000 then - [cp as char] - else if 0x10000 <= cp < 0x110000 then - Utf16Encode2(cp - 0x10000) - else - [] // Invalid: drop // TODO - } - - function Utf16Encode(codepoints: seq, start: nat := 0): seq - decreases |codepoints| - start - { - if start >= |codepoints| then [] - else Utf16Encode1(codepoints[start]) + Utf16Encode(codepoints, start + 1) - } - - function Utf8Encode1(cp: uint32): seq { - var bv := cp as bv32; - if cp < 0x80 then - [cp as uint8] - else if cp < 0x0800 then - [(((bv >> 6) & 0x1F) | 0xC0) as uint8, - ( (bv & 0x3F) | 0x80) as uint8] - else if cp < 0x10000 then - [(((bv >> 12) & 0x0F) | 0xE0) as uint8, - (((bv >> 6) & 0x3F) | 0x80) as uint8, - ( (bv & 0x3F) | 0x80) as uint8] - else if cp < 0x110000 then - [(((bv >> 18) & 0x07) | 0xF0) as uint8, - (((bv >> 12) & 0x3F) | 0x80) as uint8, - (((bv >> 6) & 0x3F) | 0x80) as uint8, - ( (bv & 0x3F) | 0x80) as uint8] - else - [] // Invalid: drop // TODO - } - - newtype opt_uint8 = c: int | -1 <= c as int < TWO_TO_THE_8 - - function Utf8DecodeChars(c0: uint8, c1: opt_uint8, c2: opt_uint8, c3: opt_uint8): (r: (uint32, uint8)) - ensures r.1 in {1, 2, 3, 4} - ensures c1 == -1 ==> r.1 <= 1 - ensures c2 == -1 ==> r.1 <= 2 - ensures c3 == -1 ==> r.1 <= 3 - { - if (c0 as bv32 & 0x80) == 0 then - (c0 as uint32, 1) - else if (c0 as bv32 & 0xE0) == 0xC0 && c1 > -1 then - (( (((c0 as bv32) & 0x1F) << 6) - | ((c1 as bv32) & 0x3F )) as uint32, - 2) - else if (c0 as bv32 & 0xF0) == 0xE0 && c1 > -1 && c2 > -1 then - (( (((c0 as bv32) & 0x0F) << 12) - | (((c1 as bv32) & 0x3F) << 6) - | ( (c2 as bv32) & 0x3F )) as uint32, - 3) - else if (c0 as bv32 & 0xF8) == 0xF0 && c1 > -1 && c2 > -1 && c3 > -1 then - (( (((c0 as bv32) & 0x07) << 18) - | (((c1 as bv32) & 0x3F) << 12) - | (((c2 as bv32) & 0x3F) << 6) - | ( (c3 as bv32) & 0x3F )) as uint32, - 4) - else - (0xFFFD, 1) // Replacement character - } - - function Utf8Decode(str: seq, start: nat := 0): seq - decreases |str| - start - { - if start >= |str| then [] - else - var c0 := str[start] as uint8; - var c1 := if |str| > start + 1 then str[start + 1] as opt_uint8 else -1; - var c2 := if |str| > start + 2 then str[start + 2] as opt_uint8 else -1; - var c3 := if |str| > start + 3 then str[start + 3] as opt_uint8 else -1; - var (cp, delta) := Utf8DecodeChars(c0, c1, c2, c3); - [cp] + Utf8Decode(str, start + delta as nat) - } - - function Utf8Encode(codepoints: seq, start: nat := 0): seq - decreases |codepoints| - start - { - if start >= |codepoints| then [] - else Utf8Encode1(codepoints[start]) + Utf8Encode(codepoints, start + 1) - } - - function Transcode16To8(s: string): seq { - Utf8Encode(Utf16Decode(s)) - } - - function Transcode8To16(s: seq): string { - Utf16Encode(Utf8Decode(s)) - } - - function ASCIIToBytes(s: string): seq - // Keep ASCII characters in `s` and discard all other characters - { - seq(|s|, idx requires 0 <= idx < |s| => - if s[idx] as uint16 < 128 then s[idx] as uint8 - else 0 as uint8) - } -} diff --git a/src/Unicode/UnicodeEncodingForm.dfy b/src/Unicode/UnicodeEncodingForm.dfy index d7f8a463..854ed1fb 100644 --- a/src/Unicode/UnicodeEncodingForm.dfy +++ b/src/Unicode/UnicodeEncodingForm.dfy @@ -239,6 +239,23 @@ abstract module {:options "-functionSyntax:4"} UnicodeEncodingForm { LemmaFlattenMinimalWellFormedCodeUnitSubsequences(ms); Seq.Flatten(ms) } + by method { + // Optimize to to avoid allocating the intermediate unflattened sequence. + // We can't quite use Seq.FlatMap easily because we need to prove the result + // is not just a seq but a WellFormedCodeUnitSeq. + // TODO: We can be even more efficient by using a JSON.Utils.Vectors.Vector instead. + s := []; + ghost var unflattened: seq := []; + for i := |vs| downto 0 + invariant unflattened == Seq.Map(EncodeScalarValue, vs[i..]) + invariant s == Seq.Flatten(unflattened) + { + var next: MinimalWellFormedCodeUnitSeq := EncodeScalarValue(vs[i]); + unflattened := [next] + unflattened; + LemmaPrependMinimalWellFormedCodeUnitSubsequence(next, s); + s := next + s; + } + } /** * Returns the scalar value sequence encoded by the given well-formed Unicode string. diff --git a/src/Unicode/UnicodeStrings.dfy b/src/Unicode/UnicodeStrings.dfy new file mode 100644 index 00000000..c2e2cf5d --- /dev/null +++ b/src/Unicode/UnicodeStrings.dfy @@ -0,0 +1,62 @@ +// RUN: %verify %s + +/// Converting between strings and UTF-8/UTF-16 +/// ============================================= +/// +/// This abstract module defines an interface for converting +/// between the Dafny `string` type and sequences of UTF-8 and UTF-16 +/// codepoints, where codepoints are represents as bounded ints +/// (`uint8` and `uint16`). +/// +/// Because the `--unicode-char` option changes the meaning of the `char` +/// type and hence the `string` type, there are two different concrete +/// implementations in separate files. +/// Only include the one that matches your value of that option! +/// +/// If you also want to maintain code that works for either mode, +/// you have two options: +/// +/// 1. Implement your logic in an abstract module as well that +/// imports this one, and define two different refining modules +/// that import the appropriate UnicodeStrings module. +/// See (TODO example) for an example. +/// 2. Do not `include` any of these three files in your source code, +/// instead passing the appropriate file to Dafny when verifying and building, +/// so that references to `UnicodeStrings` can be resolved. +/// +/// Option 2. avoids needing to write boilerplate refining modules, +/// but is less IDE-friendly until we have better project configuration support. + +include "../BoundedInts.dfy" +include "../Wrappers.dfy" +include "../Collections/Sequences/Seq.dfy" +include "Utf8EncodingForm.dfy" +include "Utf16EncodingForm.dfy" + +abstract module {:options "-functionSyntax:4"} AbstractUnicodeStrings { + + import Seq + + import opened Wrappers + import opened BoundedInts + + function ToUTF8Checked(s: string): Option> + + function ASCIIToUTF8(s: string): seq + requires forall i | 0 <= i < |s| :: 0 <= s[i] as int < 128 + { + Seq.Map(c requires 0 <= c as int < 128 => c as uint8, s) + } + + function FromUTF8Checked(bs: seq): Option + + function ToUTF16Checked(s: string): Option> + + function ASCIIToUTF16(s: string): seq + requires forall i | 0 <= i < |s| :: 0 <= s[i] as int < 128 + { + Seq.Map(c requires 0 <= c as int < 128 => c as uint16, s) + } + + function FromUTF16Checked(bs: seq): Option +} \ No newline at end of file diff --git a/src/Unicode/UnicodeStringsWithUnicodeChar.dfy b/src/Unicode/UnicodeStringsWithUnicodeChar.dfy new file mode 100644 index 00000000..3f9d1259 --- /dev/null +++ b/src/Unicode/UnicodeStringsWithUnicodeChar.dfy @@ -0,0 +1,85 @@ +// RUN: %verify --unicode-char:true %s + +/// Converting between strings and UTF-8/UTF-16 +/// ============================================= +/// +/// Implementation of `AbstractUnicodeStrings` for `--unicode-char:true`. +/// See `UnicodeStrings.dfy` for details. + +include "UnicodeStrings.dfy" +include "../Wrappers.dfy" +include "../Collections/Sequences/Seq.dfy" + +module {:options "-functionSyntax:4"} UnicodeStrings refines AbstractUnicodeStrings { + + import Unicode + import Utf8EncodingForm + import Utf16EncodingForm + + lemma {:vcs_split_on_every_assert} CharIsUnicodeScalarValue(c: char) + ensures + && var asBits := c as bv24; + && asBits <= 0x10_FFFF + && (0 <= asBits < Unicode.HIGH_SURROGATE_MIN || Unicode.LOW_SURROGATE_MAX < asBits) + { + assert c as int < 0x11_0000; + // TODO: Doesn't verify and not sure what else to try + assert c as int as bv24 < 0x11_0000 as bv24; + var asBits := c as int as bv24; + assert (asBits < Unicode.HIGH_SURROGATE_MIN || asBits > Unicode.LOW_SURROGATE_MAX); + assert asBits <= 0x10_FFFF; + } + + lemma UnicodeScalarValueIsChar(sv: Unicode.ScalarValue) + ensures + && var asInt := sv as int; + && (0 <= asInt < 0xD800 || 0xE000 <= asInt < 0x11_0000) + { + var asInt := sv as int; + assert (asInt < 0xD800 || asInt > 0xDFFF); + assert (asInt < 0xDBFF || asInt > 0xDC00); + } + + function CharAsUnicodeScalarValue(c: char): Unicode.ScalarValue { + CharIsUnicodeScalarValue(c); + c as Unicode.ScalarValue + } + + function CharFromUnicodeScalarValue(sv: Unicode.ScalarValue): char { + UnicodeScalarValueIsChar(sv); + // TODO: Can we avoid the extra cast to int? + sv as int as char + } + + function ToUTF8Checked(s: string): Option> + ensures ToUTF8Checked(s).Some? + { + var asCodeUnits := Seq.Map(CharAsUnicodeScalarValue, s); + var asUtf8CodeUnits := Utf8EncodingForm.EncodeScalarSequence(asCodeUnits); + var asBytes := Seq.Map(cu => cu as uint8, asUtf8CodeUnits); + Some(asBytes) + } + + function FromUTF8Checked(bs: seq): Option { + var asCodeUnits := Seq.Map(c => c as Utf8EncodingForm.CodeUnit, bs); + var utf32 :- Utf8EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asChars := Seq.Map(CharFromUnicodeScalarValue, utf32); + Some(asChars) + } + + function ToUTF16Checked(s: string): Option> + ensures ToUTF16Checked(s).Some? + { + var asCodeUnits := Seq.Map(CharAsUnicodeScalarValue, s); + var asUtf16CodeUnits := Utf16EncodingForm.EncodeScalarSequence(asCodeUnits); + var asBytes := Seq.Map(cu => cu as uint16, asUtf16CodeUnits); + Some(asBytes) + } + + function FromUTF16Checked(bs: seq): Option { + var asCodeUnits := Seq.Map(c => c as Utf16EncodingForm.CodeUnit, bs); + var utf32 :- Utf16EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asChars := Seq.Map(CharFromUnicodeScalarValue, utf32); + Some(asChars) + } +} \ No newline at end of file diff --git a/src/Unicode/UnicodeStringsWithoutUnicodeChar.dfy b/src/Unicode/UnicodeStringsWithoutUnicodeChar.dfy new file mode 100644 index 00000000..2a095d25 --- /dev/null +++ b/src/Unicode/UnicodeStringsWithoutUnicodeChar.dfy @@ -0,0 +1,52 @@ +// RUN: %verify --unicode-char:false %s + +/// Converting between strings and UTF-8/UTF-16 +/// ============================================= +/// +/// Implementation of `AbstractUnicodeStrings` for `--unicode-char:false`. +/// See `UnicodeStrings.dfy` for details. + +include "UnicodeStrings.dfy" +include "../Wrappers.dfy" +include "../Collections/Sequences/Seq.dfy" + +module {:options "-functionSyntax:4"} UnicodeStrings refines AbstractUnicodeStrings { + + import Unicode + import Utf8EncodingForm + import Utf16EncodingForm + + predicate IsWellFormedString(s: string) + ensures |s| == 0 ==> IsWellFormedString(s) + { + Utf16EncodingForm.IsWellFormedCodeUnitSequence(Seq.Map(c => c as Utf16EncodingForm.CodeUnit, s)) + } + + function ToUTF8Checked(s: string): Option> { + var asCodeUnits := Seq.Map(c => c as Utf16EncodingForm.CodeUnit, s); + var utf32 :- Utf16EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asUtf8CodeUnits := Utf8EncodingForm.EncodeScalarSequence(utf32); + Some(Seq.Map(c => c as byte, asUtf8CodeUnits)) + } + + function {:vcs_split_on_every_assert} FromUTF8Checked(bs: seq): Option { + var asCodeUnits := Seq.Map(c => c as Utf8EncodingForm.CodeUnit, bs); + var utf32 :- Utf8EncodingForm.DecodeCodeUnitSequenceChecked(asCodeUnits); + var asUtf16CodeUnits := Utf16EncodingForm.EncodeScalarSequence(utf32); + Some(Seq.Map(cu => cu as char, asUtf16CodeUnits)) + } + + function ToUTF16Checked(s: string): Option> { + if Utf16EncodingForm.IsWellFormedCodeUnitSequence(Seq.Map(c => c as Utf16EncodingForm.CodeUnit, s)) then + Some(Seq.Map(c => c as uint16, s)) + else + None + } + + function FromUTF16Checked(bs: seq): Option { + if Utf16EncodingForm.IsWellFormedCodeUnitSequence(Seq.Map(c => c as Utf16EncodingForm.CodeUnit, bs)) then + Some(Seq.Map(c => c as char, bs)) + else + None + } +} \ No newline at end of file diff --git a/src/Wrappers.dfy b/src/Wrappers.dfy index 852d9a8c..167cb574 100644 --- a/src/Wrappers.dfy +++ b/src/Wrappers.dfy @@ -14,6 +14,12 @@ module {:options "-functionSyntax:4"} Wrappers { case None() => Failure("Option is None") } + function ToResult'(error: R): Result { + match this + case Some(v) => Success(v) + case None() => Failure(error) + } + function UnwrapOr(default: T): T { match this case Some(v) => v