Skip to content

Commit

Permalink
Use hex escape sequences instead of octal escape sequences.
Browse files Browse the repository at this point in the history
Octal escape sequences are the least used form of escape sequences, while hex escapes are supported almost everywhere.
The only exceptions are Java, C++ and Rust.
  • Loading branch information
Mingun committed Oct 4, 2024
1 parent 542b241 commit 9a627bd
Show file tree
Hide file tree
Showing 8 changed files with 50 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "[]uint8{34, 0, 10, 64, 65, 66, 92}",
JavaCompiler -> "new byte[] { 34, 0, 10, 64, 65, 66, 92 }",
JavaScriptCompiler -> "new Uint8Array([34, 0, 10, 64, 65, 66, 92])",
LuaCompiler -> "\"\\034\\000\\010\\064\\065\\066\\092\"",
LuaCompiler -> "\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
PerlCompiler -> "pack('C*', (34, 0, 10, 64, 65, 66, 92))",
PHPCompiler -> "\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
PythonCompiler -> "b\"\\x22\\x00\\x0A\\x40\\x41\\x42\\x5C\"",
Expand All @@ -420,7 +420,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "[]uint8{255, 0, 255}",
JavaCompiler -> "new byte[] { -1, 0, -1 }",
JavaScriptCompiler -> "new Uint8Array([255, 0, 255])",
LuaCompiler -> "\"\\255\\000\\255\"",
LuaCompiler -> "\"\\xFF\\x00\\xFF\"",
PerlCompiler -> "pack('C*', (255, 0, 255))",
PHPCompiler -> "\"\\xFF\\x00\\xFF\"",
PythonCompiler -> "b\"\\xFF\\x00\\xFF\"",
Expand All @@ -435,7 +435,7 @@ class TranslatorSpec extends AnyFunSpec {
GoCompiler -> "len([]uint8{0, 1, 2})",
JavaCompiler -> "new byte[] { 0, 1, 2 }.length",
JavaScriptCompiler -> "new Uint8Array([0, 1, 2]).length",
LuaCompiler -> "#\"\\000\\001\\002\"",
LuaCompiler -> "#\"\\x00\\x01\\x02\"",
PerlCompiler -> "length(pack('C*', (0, 1, 2)))",
PHPCompiler -> "strlen(\"\\x00\\x01\\x02\")",
PythonCompiler -> "len(b\"\\x00\\x01\\x02\")",
Expand Down Expand Up @@ -555,14 +555,14 @@ class TranslatorSpec extends AnyFunSpec {
full("\"str\\0next\"", CalcIntType, CalcStrType, ResultMap(
CppCompiler -> "std::string(\"str\\000next\", 8)",
CSharpCompiler -> "\"str\\0next\"",
GoCompiler -> "\"str\\000next\"",
GoCompiler -> "\"str\\x00next\"",
JavaCompiler -> "\"str\\000next\"",
JavaScriptCompiler -> "\"str\\x00next\"",
LuaCompiler -> "\"str\\000next\"",
PerlCompiler -> "\"str\\000next\"",
PHPCompiler -> "\"str\\000next\"",
PythonCompiler -> "u\"str\\000next\"",
RubyCompiler -> "\"str\\000next\""
LuaCompiler -> "\"str\\x00next\"",
PerlCompiler -> "\"str\\x00next\"",
PHPCompiler -> "\"str\\x00next\"",
PythonCompiler -> "u\"str\\x00next\"",
RubyCompiler -> "\"str\\x00next\""
))
}

Expand Down
1 change: 0 additions & 1 deletion shared/src/main/scala/io/kaitai/struct/JSON.scala
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ object JSON extends CommonLiterals {
}
}

/**
 * JSON string literals accept neither octal escapes nor hex escapes of the
 * form `\xHH`, so the [[translators.CommonLiterals.strLiteralGenericCC]]
 * default is replaced: control characters are delegated to
 * [[strLiteralUnicode]] instead.
 *
 * @param code character code to represent
 * @return whatever [[strLiteralUnicode]] produces for `code`
 */
override def strLiteralGenericCC(code: Char): String = {
  strLiteralUnicode(code)
}

def stringToJson(str: String): String =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ trait CommonLiterals {
/**
* Handle ASCII character conversion for inlining into string literals.
* Default implementation consults [[asciiCharQuoteMap]] first, then
* just dumps it as is if it's a printable ASCII charcter, or calls
* just dumps it as is if it's a printable ASCII character, or calls
* [[strLiteralGenericCC]] if it's a control character.
* @param code character code to convert into string for inclusion in
* a string literal
Expand All @@ -53,18 +53,14 @@ trait CommonLiterals {

/**
* Converts generic control character code into something that's allowed
* inside a string literal. Default implementation uses octal encoding,
* inside a string literal. Default implementation uses hex encoding,
* which is ok for most C-derived languages.
*
* Note that we use strictly 3 octal digits to work around potential
* problems with following decimal digits, i.e. "\0" + "2" that would be
* parsed as single character "\02" = "\x02", instead of two characters
* "\x00\x32".
* @param code character code to represent
* @return string literal representation of given code
*/
def strLiteralGenericCC(code: Char): String =
"\\%03o".format(code.toInt)
"\\x%02X".format(code.toInt)

/**
* Converts Unicode (typically, non-ASCII) character code into something
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,21 @@ class CppTranslator(provider: TypeProvider, importListSrc: CppImportList, import
}
}

/**
 * Renders a control character as a C++ octal escape sequence.
 *
 * Hex escapes in C++ are not limited in length, so a hex escape could
 * absorb hex digits that happen to follow it in the literal. Octal escapes
 * are used instead.
 *
 * The value is always zero-padded to 3 octal digits so that a digit which
 * follows the escape cannot be absorbed into it: "\0" followed by "2" would
 * be parsed as the single character "\02" = "\x02" instead of the two
 * characters "\x00\x32".
 *
 * NOTE(review): codes above 0x1FF would format to more than 3 digits;
 * presumably only control characters are passed here. Confirm against callers.
 *
 * @see https://en.cppreference.com/w/cpp/language/escape
 * @param code character code to represent
 * @return backslash followed by the zero-padded octal value of `code`
 */
override def strLiteralGenericCC(code: Char): String = {
  val octalDigits = "%03o".format(code.toInt)
  "\\" + octalDigits
}

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Mod) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,6 @@ class JavaScriptTranslator(provider: TypeProvider, importList: ImportList) exten
/**
 * Renders a non-literal byte array as a `Uint8Array` constructed from the
 * translated element expressions, separated by ", ".
 *
 * @param elts element expressions of the byte array
 * @return JavaScript expression creating the array
 */
override def doByteArrayNonLiteral(elts: Seq[Ast.expr]): String =
  elts.map(translate).mkString("new Uint8Array([", ", ", "])")

/**
 * JavaScript rendition of a control character, always in two-digit hex form.
 * Octal escapes in string literals might be accepted in non-strict JS mode,
 * but strict mode only permits hex or Unicode escapes; hex is chosen because
 * it is the shorter of the two.
 *
 * @see https://github.com/kaitai-io/kaitai_struct/issues/279
 * @param code character code to represent
 * @return backslash, `x`, then the lowercase hex value of `code`
 *         zero-padded to 2 digits
 */
override def strLiteralGenericCC(code: Char): String = {
  val hexDigits = "%02x".format(code.toInt)
  "\\x" + hexDigits
}

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Div) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,21 @@ class JavaTranslator(provider: TypeProvider, importList: ImportList) extends Bas
/**
 * Renders a non-literal byte array as a Java `byte[]` array initializer
 * whose items are the translated element expressions, separated by ", ".
 *
 * @param elts element expressions of the byte array
 * @return Java expression creating the array
 */
override def doByteArrayNonLiteral(elts: Seq[expr]): String =
  elts.map(translate).mkString("new byte[] { ", ", ", " }")

/**
 * Renders a control character as a Java octal escape sequence.
 *
 * Java has no two-digit hex escape of the form `\xHH`, so octal escapes are
 * used; they are shorter than the four-digit Unicode escape alternative.
 *
 * The value is always zero-padded to 3 octal digits so that a digit which
 * follows the escape cannot be absorbed into it: "\0" followed by "2" would
 * be parsed as the single character "\02" = "\x02" instead of the two
 * characters "\x00\x32".
 *
 * NOTE(review): codes above 0x1FF would format to more than 3 digits;
 * presumably only control characters are passed here. Confirm against callers.
 *
 * @see https://docs.oracle.com/javase/specs/jls/se7/html/jls-3.html#jls-3.10.6
 * @param code character code to represent
 * @return backslash followed by the zero-padded octal value of `code`
 */
override def strLiteralGenericCC(code: Char): String = {
  val octalDigits = "%03o".format(code.toInt)
  "\\" + octalDigits
}

override def genericBinOp(left: Ast.expr, op: Ast.operator, right: Ast.expr, extPrec: Int) = {
(detectType(left), detectType(right), op) match {
case (_: IntType, _: IntType, Ast.operator.Mod) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ class LuaTranslator(provider: TypeProvider, importList: ImportList) extends Base
'\b' -> "\\b",
'\u000b' -> "\\v",
'\f' -> "\\f",
'\u001b' -> "\\027"
)

override def strLiteralUnicode(code: Char): String =
Expand Down Expand Up @@ -71,7 +70,7 @@ class LuaTranslator(provider: TypeProvider, importList: ImportList) extends Base
/**
 * Renders an array literal as a Lua table constructor: the translated
 * elements, separated by ", ", wrapped in curly braces.
 *
 * @param t element data type (not used by this implementation)
 * @param value element expressions of the array
 * @return Lua table constructor expression
 */
override def doArrayLiteral(t: DataType, value: Seq[Ast.expr]): String =
  value.map(elem => translate(elem)).mkString("{", ", ", "}")
override def doByteArrayLiteral(arr: Seq[Byte]): String =
"\"" + decEscapeByteArray(arr) + "\""
"\"" + Utils.hexEscapeByteArray(arr) + "\""
/**
 * Renders a non-literal byte array as a `string.char(...)` call whose
 * arguments are the translated element expressions, separated by ", ".
 *
 * @param values element expressions of the byte array
 * @return Lua expression building the byte string
 */
override def doByteArrayNonLiteral(values: Seq[Ast.expr]): String = {
  // Every expression is assumed to produce an integer in the range [0; 255];
  // nothing here checks or enforces that.
  val args = values.map(translate)
  args.mkString("string.char(", ", ", ")")
}
Expand Down Expand Up @@ -189,14 +188,4 @@ class LuaTranslator(provider: TypeProvider, importList: ImportList) extends Base
case Ast.unaryop.Not => "not"
case _ => super.unaryOp(op)
}

/**
 * Renders a byte array as the body of a Lua-style string literal made of
 * decimal escapes: each byte becomes a backslash followed by its unsigned
 * value as three zero-padded decimal digits (i.e. like \255).
 *
 * @param arr byte array to escape
 * @return the decimal escapes of all bytes, concatenated in order
 */
private def decEscapeByteArray(arr: Seq[Byte]): String =
  arr
    .map(_ & 0xff) // reinterpret the signed byte as an unsigned 0..255 value
    .map(unsigned => "\\" + "%03d".format(unsigned))
    .mkString
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ class RustTranslator(provider: TypeProvider, config: RuntimeConfig)
/**
 * Control characters are rendered by delegating to [[strLiteralUnicode]].
 *
 * @param code character code to represent
 * @return whatever [[strLiteralUnicode]] produces for `code`
 */
override def strLiteralGenericCC(code: Char): String = {
  strLiteralUnicode(code)
}

/**
 * Renders a character as a Rust braced Unicode escape: backslash, `u`, then
 * the character code as lowercase hex digits (no padding) inside curly braces.
 *
 * This form is used because hex escapes of the form `\xHH` in Rust allow
 * only codes in the range 0x00 - 0x7f.
 *
 * @see https://doc.rust-lang.org/reference/tokens.html#examples
 * @param code character code to represent
 * @return string literal representation of given code
 */
override def strLiteralUnicode(code: Char): String = {
  val hexDigits = code.toInt.toHexString
  "\\u{" + hexDigits + "}"
}

Expand Down

0 comments on commit 9a627bd

Please sign in to comment.