Skip to content

Commit

Permalink
Use hex escape sequences instead of octal escape sequences.
Browse files Browse the repository at this point in the history
Octal escape sequences are the least used form of escape sequences, while hex escape sequences are supported almost everywhere.
The only exceptions are Java, C++ and Rust, which keep their own escape forms.
  • Loading branch information
Mingun committed Mar 28, 2024
1 parent 15959f0 commit e2f65df
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "[]uint8{34, 0, 10, 64, 65, 66, 92}",
JavaCompiler -> "new byte[] { 34, 0, 10, 64, 65, 66, 92 }",
JavaScriptCompiler -> "[34, 0, 10, 64, 65, 66, 92]",
LuaCompiler -> "\"\\034\\000\\010\\064\\065\\066\\092\"",
LuaCompiler -> "\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
PerlCompiler -> "pack('C*', (34, 0, 10, 64, 65, 66, 92))",
PHPCompiler -> "\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
PythonCompiler -> "b\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
Expand All @@ -420,7 +420,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "[]uint8{255, 0, 255}",
JavaCompiler -> "new byte[] { -1, 0, -1 }",
JavaScriptCompiler -> "[255, 0, 255]",
LuaCompiler -> "\"\\255\\000\\255\"",
LuaCompiler -> "\"\\xFF\\x00\\xFF\"",
PerlCompiler -> "pack('C*', (255, 0, 255))",
PHPCompiler -> "\"\\xFF\\x00\\xFF\"",
PythonCompiler -> "b\"\\xFF\\x00\\xFF\"",
Expand All @@ -435,7 +435,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "len([]uint8{0, 1, 2})",
JavaCompiler -> "new byte[] { 0, 1, 2 }.length",
JavaScriptCompiler -> "[0, 1, 2].length",
LuaCompiler -> "#\"\\000\\001\\002\"",
LuaCompiler -> "#\"\\x00\\x01\\x02\"",
PerlCompiler -> "length(pack('C*', (0, 1, 2)))",
PHPCompiler -> "strlen(\"\\x00\\x01\\x02\")",
PythonCompiler -> "len(b\"\\x00\\x01\\x02\")",
Expand Down Expand Up @@ -555,14 +555,14 @@ class TranslatorSpec extends AnyFunSpec {
full("\"str\\0next\"", CalcIntType, CalcStrType, ResultMap(
CppCompiler -> "std::string(\"str\\000next\", 8)",
CSharpCompiler -> "\"str\\0next\"",
GoCompiler -> "\"str\\000next\"",
GoCompiler -> "\"str\\x00next\"",
JavaCompiler -> "\"str\\000next\"",
JavaScriptCompiler -> "\"str\\x00next\"",
LuaCompiler -> "\"str\\000next\"",
PerlCompiler -> "\"str\\000next\"",
PHPCompiler -> "\"str\\000next\"",
PythonCompiler -> "u\"str\\000next\"",
RubyCompiler -> "\"str\\000next\""
LuaCompiler -> "\"str\\x00next\"",
PerlCompiler -> "\"str\\x00next\"",
PHPCompiler -> "\"str\\x00next\"",
PythonCompiler -> "u\"str\\x00next\"",
RubyCompiler -> "\"str\\x00next\""
))
}

Expand Down
1 change: 0 additions & 1 deletion shared/src/main/scala/io/kaitai/struct/JSON.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ object JSON extends CommonLiterals {
}
}

/** `\xHH` hex escapes (which [[translators.CommonLiterals.strLiteralGenericCC]] uses by default) are not allowed in JSON */
override def strLiteralGenericCC(code: Char): String = strLiteralUnicode(code)

def stringToJson(str: String): String =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ trait CommonLiterals {
/**
* Handle ASCII character conversion for inlining into string literals.
* Default implementation consults [[asciiCharQuoteMap]] first, then
* just dumps it as is if it's a printable ASCII charcter, or calls
* just dumps it as is if it's a printable ASCII character, or calls
* [[strLiteralGenericCC]] if it's a control character.
* @param code character code to convert into string for inclusion in
* a string literal
Expand All @@ -53,18 +53,14 @@ trait CommonLiterals {

/**
* Converts generic control character code into something that's allowed
* inside a string literal. Default implementation uses octal encoding,
* inside a string literal. Default implementation uses hex encoding,
* which is ok for most C-derived languages.
*
* Note that we use strictly 3 octal digits to work around potential
* problems with following decimal digits, i.e. "\0" + "2" that would be
* parsed as single character "\02" = "\x02", instead of two characters
* "\x00\x32".
* @param code character code to represent
* @return string literal representation of given code
*/
def strLiteralGenericCC(code: Char): String =
"\\%03o".format(code.toInt)
"\\x%02x".format(code.toInt)

/**
* Converts Unicode (typically, non-ASCII) character code into something
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,21 @@ class CppTranslator(provider: TypeProvider, importListSrc: CppImportList, import
/**
 * Renders a byte array as a C++ `std::string` constructor call: the bytes
 * are hex-escaped by [[Utils.hexEscapeByteArray]] and the byte count is
 * passed explicitly as the second argument.
 *
 * @param arr bytes to embed in the generated literal
 * @return C++ expression text constructing the string
 */
override def doByteArrayLiteral(arr: Seq[Byte]): String = {
  val escaped = Utils.hexEscapeByteArray(arr)
  s"""std::string("$escaped", ${arr.length})"""
}

/**
 * Hex escapes in C++ are not limited in length, so octal escapes are used
 * here instead, as they are shorter and have a fixed width.
 *
 * Exactly 3 octal digits are always emitted to avoid problems with digits
 * that follow the escape: "\0" + "2" would otherwise be parsed as the
 * single character "\02" = "\x02" instead of the two characters
 * "\x00\x32".
 *
 * @see https://en.cppreference.com/w/cpp/language/escape
 * @param code character code to represent
 * @return string literal representation of given code
 */
override def strLiteralGenericCC(code: Char): String = {
  val octal = code.toInt.toOctalString
  // Left-pad to at least 3 digits; longer values are emitted unchanged.
  "\\" + ("0" * (3 - octal.length)) + octal
}

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Mod) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,6 @@ class JavaScriptTranslator(provider: TypeProvider) extends BaseTranslator(provid
/**
 * Renders a byte array built from arbitrary expressions as a JavaScript
 * `Uint8Array` constructed from an array literal of the translated elements.
 *
 * @param elts expressions producing the individual bytes
 * @return JavaScript expression text constructing the array
 */
override def doByteArrayNonLiteral(elts: Seq[Ast.expr]): String =
  elts.map(translate).mkString("new Uint8Array([", ", ", "])")

/**
* JavaScript rendition of common control character that would use hex form,
* not octal. "Octal" control character string literals might be accepted
* in non-strict JS mode, but in strict mode only hex or unicode are ok.
* Here we'll use hex, as they are shorter.
*
* @see https://github.com/kaitai-io/kaitai_struct/issues/279
* @param code character code to represent
* @return string literal representation of given code
*/
override def strLiteralGenericCC(code: Char): String =
"\\x%02x".format(code.toInt)

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Div) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,21 @@ class JavaTranslator(provider: TypeProvider, importList: ImportList) extends Bas
/**
 * Renders a byte array built from arbitrary expressions as a Java
 * `new byte[] { ... }` array initializer of the translated elements.
 *
 * @param elts expressions producing the individual bytes
 * @return Java expression text constructing the array
 */
override def doByteArrayNonLiteral(elts: Seq[expr]): String =
  elts.map(translate).mkString("new byte[] { ", ", ", " }")

/**
 * Java does not support two-digit hex escape sequences, so octal escapes
 * are used instead, as they are shorter than the alternatives.
 *
 * Exactly 3 octal digits are always emitted to avoid problems with digits
 * that follow the escape: "\0" + "2" would otherwise be parsed as the
 * single character "\02" = "\x02" instead of the two characters
 * "\x00\x32".
 *
 * @see https://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.10.6
 * @param code character code to represent
 * @return string literal representation of given code
 */
override def strLiteralGenericCC(code: Char): String = {
  val digits = Integer.toOctalString(code.toInt)
  // Zero-pad on the left up to 3 digits; longer values pass through as is.
  val padded = digits.reverse.padTo(3, '0').reverse
  s"\\$padded"
}

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Mod) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ class LuaTranslator(provider: TypeProvider, importList: ImportList) extends Base
'\b' -> "\\b",
'\u000b' -> "\\v",
'\f' -> "\\f",
'\u001b' -> "\\027"
)

override def strLiteralUnicode(code: Char): String =
Expand Down Expand Up @@ -187,12 +186,12 @@ class LuaTranslator(provider: TypeProvider, importList: ImportList) extends Base
}

/**
 * Converts byte array (Seq[Byte]) into hex-escaped Lua-style literal
 * characters (i.e. like \xFF).
 *
 * Each byte is masked with 0xff so that negative (signed) byte values are
 * rendered as their unsigned 0x00..0xFF form, and is always written as
 * exactly two UPPERCASE hex digits. Uppercase matters: it is the form the
 * translator spec expects for Lua byte array literals (e.g. `\x0A`, `\x5C`,
 * `\xFF`), matching the PHP and Python expectations for the same inputs.
 *
 * The method keeps its historical `dec...` name for compatibility with
 * existing callers, although the output is no longer decimal-escaped.
 *
 * NOTE(review): `\xHH` escapes are part of the Lua lexical grammar only
 * since Lua 5.2 - confirm the minimum Lua version targeted by the runtime.
 *
 * @param arr byte array to escape
 * @return array contents hex-escaped as string
 */
private def decEscapeByteArray(arr: Seq[Byte]): String =
  arr.map(b => "\\x%02X".format(b & 0xff)).mkString
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ class RustTranslator(provider: TypeProvider, config: RuntimeConfig) extends Base
'\\' -> "\\\\"
)

/**
 * Hex escapes of the form `\xHH` in Rust allow only codes in the range
 * 0x00 - 0x7f, so characters handled here are written using the
 * `\u{...}` escape form with lowercase hex digits and no padding.
 *
 * @see https://doc.rust-lang.org/reference/tokens.html#examples
 * @param code character code to represent
 * @return string literal representation of given code
 */
override def strLiteralUnicode(code: Char): String = {
  val hex = code.toInt.toHexString
  "\\u{" + hex + "}"
}

Expand Down

0 comments on commit e2f65df

Please sign in to comment.