-
Notifications
You must be signed in to change notification settings - Fork 0
/
build.rs
135 lines (114 loc) · 3.66 KB
/
build.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#![feature(iter_intersperse)]
use std::env;
use std::fs;
use std::path::Path;
use std::str;
use std::str::from_utf8;
use regex::Captures;
use regex::Regex;
const C_URL: &str = "https://raw.githubusercontent.com/InterNetNews/inn/5151f24ef885f7e18fc582c64b33fe10a1751636/tests/lib/uwildmat-t.c";
const RUST_FILENAME: &str = "gen_test_suite.rs";
/// Build-script entry point.
///
/// Downloads the upstream C test file, extracts the test-suite section,
/// rewrites it into Rust syntax and writes the result to
/// `$OUT_DIR/gen_test_suite.rs`.
///
/// # Panics
///
/// Panics if `OUT_DIR` is not set, if the download fails, if the expected
/// markers are missing from the downloaded source, or if the output file
/// cannot be written.
#[tokio::main]
async fn main() {
    let out_dir = env::var_os("OUT_DIR").expect("OUT_DIR must be set when running a build script");
    let dest_path = Path::new(&out_dir).join(RUST_FILENAME);
    // get the c source code
    let c_code = fetch_code().await;
    // extract the test suite
    let c_tests: String = extract_suite(c_code);
    // put multiline statements on single lines
    let single_lines = c_tests.replace(",\n", ", ");
    // replace C string literals with Rust raw string literals (or an
    // `<<INVALID...>>` marker when the bytes are not valid UTF-8)
    let rustified = to_unchecked_calls(single_lines);
    // comment out the lines that cannot be expressed as valid UTF-8 tests
    let no_malformed = skip_malformed(rustified);
    // generate rust source code
    let rust_code = make_rust_source(no_malformed);
    // write to file
    fs::write(&dest_path, rust_code).expect("failed to write the generated test suite");
    // The path is resolved relative to the package root. Pointing it at a
    // file that does not exist makes Cargo consider the script permanently
    // stale and re-run it (including the download) on every build.
    // NOTE(review): assumes this script is the package-root `build.rs`.
    println!("cargo:rerun-if-changed=build.rs");
}
/// Downloads the C source referenced by `C_URL` and returns its body as text.
///
/// # Panics
///
/// Panics if the request cannot be sent, if the server answers with a
/// non-success HTTP status, or if the response body cannot be read as text.
async fn fetch_code() -> String {
    reqwest::get(C_URL)
        .await
        .expect("failed to request the upstream C test source")
        // Without this check an HTTP error page would be returned as if it
        // were the C source and only fail later with an unrelated message.
        .error_for_status()
        .expect("upstream server returned an error status for the C test source")
        .text()
        .await
        .expect("failed to read the upstream C test source body")
}
/// Returns the part of `c_code` that sits between the first
/// `/* clang-format off */` marker and whichever comes next: another
/// `/* clang-format off */` marker or a `/* clang-format on */` marker.
/// If neither follows, everything after the first marker is returned.
///
/// # Panics
///
/// Panics if `c_code` contains no `/* clang-format off */` marker.
fn extract_suite(c_code: String) -> String {
    let mut sections = c_code.split("/* clang-format off */");
    // Discard whatever precedes the first opening marker.
    let _preamble = sections.next();
    let after_start = sections.next().unwrap();

    // `split` always yields at least one piece, so the fallback is only a
    // formality that mirrors collecting an empty option into a `String`.
    let suite = after_start
        .split("/* clang-format on */")
        .next()
        .unwrap_or_default();
    suite.to_owned()
}
// The C source has no notion of UTF-8 and happily feeds arbitrary byte
// sequences to the matcher; when something cannot be decoded as UTF-8 the C
// implementation falls back to using just the first octet. This Rust
// implementation, in contrast, _requires_ valid UTF-8 strings, so the test
// lines that rely on malformed sequences have to be ignored.
//
// Every line containing an `<<INVALID` marker or a `test_v(` call is turned
// into a `// ` comment; all other lines pass through unchanged. Lines are
// re-joined with `\n`, without a trailing newline.
fn skip_malformed(src: String) -> String {
    let mut kept: Vec<String> = Vec::new();
    for line in src.lines() {
        let unsupported = line.contains("<<INVALID") || line.contains("test_v(");
        let prefix = if unsupported { "// " } else { "" };
        kept.push(format!("{}{}", prefix, line));
    }
    kept.join("\n")
}
/// Rewrites every double-quoted C string in `src` as a Rust raw string literal.
///
/// Doubled backslashes are first collapsed to single ones. Then, for each
/// span between two double quotes, C octal escapes (`\` followed by one to
/// three octal digits) are decoded into bytes and every other character
/// contributes its UTF-8 bytes. If the resulting bytes are valid UTF-8 the
/// string is emitted as `r"..."`; otherwise it is replaced with an
/// `<<INVALID(error):content>>` marker.
///
/// The string regex ends at the next double quote, so escaped quotes inside a
/// C string are not recognised.
///
/// # Panics
///
/// Panics if an octal escape decodes to a value that does not fit in a byte.
fn to_unchecked_calls(src: String) -> String {
    // "unescape" C string backslashes
    let code = src.replace(r"\\", r"\");
    // find C strings, group 1 is the content
    let str_rx = Regex::new(r#""(.*?)""#).unwrap();
    // find either a C octal escape or a literal char; only the digits 0-7 are
    // valid in an octal escape, so `\18` is the byte 0o1 followed by a '8'
    let oct_rx = Regex::new(r"\\(?P<oct>[0-7]{1,3})|(?P<lit>.)").unwrap();
    // for each C string
    let fixed = str_rx.replace_all(&code, |st: &Captures| {
        let content = &st[1];
        // convert each octal escape/literal to its bytes
        let mut bytes: Vec<u8> = Vec::with_capacity(content.len());
        for cap in oct_rx.captures_iter(content) {
            if let Some(oct) = cap.name("oct") {
                let byte = u8::from_str_radix(oct.as_str(), 8).unwrap_or_else(|e| {
                    panic!(
                        "octal escape '\\{}' does not fit in a byte: {}",
                        oct.as_str(),
                        e
                    )
                });
                bytes.push(byte);
            } else if let Some(lit) = cap.name("lit") {
                // keep every byte of the character, not just the first one,
                // so multi-byte UTF-8 literals survive intact
                bytes.extend_from_slice(lit.as_str().as_bytes());
            } else {
                unreachable!("regex matched neither alternative, captures: '{:?}'", cap);
            }
        }
        // verify that it's a valid utf-8 sequence
        match from_utf8(&bytes) {
            Ok(s) => format!("r\"{}\"", s),
            Err(e) => format!("<<INVALID({}):{}>>", e, content),
        }
    });
    fixed.into_owned()
}
/// Wraps the translated test statements in `code` into the generated Rust
/// source: three `UWILDMAT_*` constants followed by a `run_inn_test_suite`
/// function whose body is `code`.
fn make_rust_source(code: String) -> String {
    // Everything that precedes the injected statements.
    const PROLOGUE: &str = r#"const UWILDMAT_MATCH: Uwildmat = Uwildmat::Match;
const UWILDMAT_FAIL: Uwildmat = Uwildmat::Fail;
const UWILDMAT_POISON: Uwildmat = Uwildmat::Poison;
#[inline]
pub(crate) fn run_inn_test_suite(
test_r: fn(usize, &str, &str, bool),
test_p: fn(usize, &str, &str, Uwildmat),
test_s: fn(usize, &str, &str, bool),
) {
"#;
    // Closes the generated function.
    const EPILOGUE: &str = "\n}\n";

    let mut generated = String::with_capacity(PROLOGUE.len() + code.len() + EPILOGUE.len());
    generated.push_str(PROLOGUE);
    generated.push_str(&code);
    generated.push_str(EPILOGUE);
    generated
}