diff --git a/quality/lr/README.md b/quality/lr/README.md new file mode 100644 index 0000000..6f20a6f --- /dev/null +++ b/quality/lr/README.md @@ -0,0 +1,27 @@ +# Quality Model Tool + +- The Python script ```qualityestimator_json_to_bin.py``` converts a logistic regressor quality estimator model from a JSON file to a binary file and vice versa. + +- To convert a JSON file to binary: + +```console + python qualityestimator_json_to_bin.py --from_json qe_model.json --out qe_model.bin +``` + +- To convert a binary file to JSON: + +```console + python qualityestimator_json_to_bin.py --to_json qe_model.bin --out qe_model.json +``` + +- The JSON file must follow this structure: +```json +{ + "mean_": [ 0.0, 0.0, 0.0, 0.0 ], + "scale_": [ 0.0, 0.0, 0.0, 0.0 ], + "coef_": [ 0.0, 0.0, 0.0, 0.0 ], + "intercept_": 0.0 +} +``` + +- The binary file has the structure defined in [LogisticRegressorQualityEstimator](https://github.com/browsermt/bergamot-translator/blob/main/src/translator/quality_estimator.h#L100-L108). 
# quality/lr/qualityestimator_json_to_bin.py
"""Convert a logistic-regressor quality estimator model between JSON and binary.

Binary layout (little-endian, no padding):

    uint64   magic number (``QE_MAGIC_NUMBER``)
    uint64   N, the number of entries in each parameter vector
    float32  scale_[N]
    float32  mean_[N]
    float32  coef_[N]
    float32  intercept_

The JSON form is an object with the keys ``scale_``, ``mean_``, ``coef_``
(lists of N numbers each) and ``intercept_`` (a single number).
"""
import argparse
import json
import struct
import sys
from collections import namedtuple

# magic(uint64_t), lrParametersDims(uint64_t)
Header_fmt = "<1Q1Q"
Header_len = struct.calcsize(Header_fmt)

QE_MAGIC_NUMBER = 8704388732126802304


def _fail(message):
    """Report *message* on stderr and terminate the process with status 1."""
    # stderr keeps diagnostics out of the converted data when it goes to stdout.
    print(message, file=sys.stderr)
    sys.exit(1)


def from_qe_file(file) -> dict:
    """Read a binary quality estimator model and return it as a dict.

    Args:
        file: Binary file-like object positioned at the start of the model.

    Returns:
        A dict with the keys ``scale_``, ``mean_``, ``coef_`` (lists of
        floats) and ``intercept_`` (a float), in that order.

    Raises:
        SystemExit: With status 1 if the header is truncated, the magic number
            does not match, or the parameter payload is truncated.
    """
    header = file.read(Header_len)
    if len(header) != Header_len:
        _fail("Invalid quality estimator file.")

    magic, paramDim = struct.unpack(Header_fmt, header)
    if magic != QE_MAGIC_NUMBER:
        _fail("Invalid quality estimator file.")

    # scale_[N] + mean_[N] + coef_[N] + intercept_
    lrParams_fmt = f"<{3 * paramDim + 1}f"
    try:
        # A corrupt dimension can describe a struct too large to represent.
        lrParams_size = struct.calcsize(lrParams_fmt)
    except struct.error:
        _fail("Invalid quality estimator file.")

    payload = file.read(lrParams_size)
    if len(payload) != lrParams_size:
        _fail("Invalid quality estimator file.")

    params = list(struct.unpack(lrParams_fmt, payload))

    return {
        "scale_": params[:paramDim],
        "mean_": params[paramDim : 2 * paramDim],
        "coef_": params[2 * paramDim : 3 * paramDim],
        "intercept_": params[3 * paramDim],
    }


def to_binary(lrParams) -> bytes:
    """Serialize logistic-regressor parameters to the binary model format.

    Args:
        lrParams: Mapping with the keys ``scale_``, ``mean_``, ``coef_``
            (equally long sequences of numbers) and ``intercept_`` (a number).

    Returns:
        The header followed by the float32 parameters, as bytes.

    Raises:
        SystemExit: With status 1 if the three vectors differ in length.
        KeyError: If one of the required keys is missing.
    """
    scale = lrParams["scale_"]
    mean = lrParams["mean_"]
    coef = lrParams["coef_"]

    paramDims = len(scale)

    # All three vectors must share one length; a single mismatch is an error.
    if paramDims != len(mean) or paramDims != len(coef):
        _fail("Invalid LR parameters.")

    params = [*scale, *mean, *coef, lrParams["intercept_"]]
    lrParams_fmt = f"<{len(params)}f"

    return struct.pack(Header_fmt, QE_MAGIC_NUMBER, paramDims) + struct.pack(
        lrParams_fmt, *params
    )


def _build_parser():
    """Create the command-line parser for the converter."""
    parser = argparse.ArgumentParser(
        description="Read and write quality estimator files."
    )
    parser.add_argument(
        "--to_json",
        type=argparse.FileType("rb"),
        help="Read quality estimator file",
    )
    parser.add_argument(
        "--from_json",
        type=argparse.FileType("r"),
        help="Read json file and generate quality estimator binary",
    )
    parser.add_argument(
        "--out",
        type=argparse.FileType("wb"),
        help="Output generated data from to_json or from_json option",
    )
    return parser


def main(argv=None) -> int:
    """Run the converter; *argv* defaults to ``sys.argv[1:]``.

    ``--to_json`` takes precedence when both input options are supplied.
    Without ``--out`` the result goes to standard output: JSON as text,
    binary data as raw bytes.

    Returns:
        The process exit status (0 on success or when no input is given).
    """
    args = _build_parser().parse_args(argv)

    try:
        output = None

        if args.to_json:
            with args.to_json as qe_file:
                output = json.dumps(from_qe_file(qe_file), indent=3)
        elif args.from_json:
            with args.from_json as json_file:
                output = to_binary(json.loads(json_file.read()))

        if output is None:
            return 0

        if args.out:
            args.out.write(
                output.encode("UTF-8") if isinstance(output, str) else output
            )
        elif isinstance(output, str):
            print(output)
        else:
            # print() would emit the bytes repr; write the raw bytes instead.
            sys.stdout.buffer.write(output)
            sys.stdout.buffer.flush()
    finally:
        if args.out:
            args.out.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())