
Commit 064f888

Add unicode table generator

Committed Jan 15, 2020
1 parent 8a87b94 · commit 064f888

File tree: 8 files changed, +564 −8 lines changed

‎.gitignore

+1 −8

@@ -34,14 +34,7 @@ __pycache__/
 # Created by default with `src/ci/docker/run.sh`:
 /obj/
 /rustllvm/
-/src/libcore/unicode/DerivedCoreProperties.txt
-/src/libcore/unicode/DerivedNormalizationProps.txt
-/src/libcore/unicode/PropList.txt
-/src/libcore/unicode/ReadMe.txt
-/src/libcore/unicode/Scripts.txt
-/src/libcore/unicode/SpecialCasing.txt
-/src/libcore/unicode/UnicodeData.txt
-/src/libcore/unicode/downloaded
+/unicode-downloads
 /target/
 # Generated by compiletest for incremental:
 /tmp/

‎Cargo.lock

+17

@@ -4930,6 +4930,16 @@ version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "612d636f949607bdf9b123b4a6f6d966dedf3ff669f7f045890d3a4a73948169"

+[[package]]
+name = "ucd-parse"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6b52bf4da6512f0f07785a04769222e50d29639e7ecd016b7806fd2de306b4"
+dependencies = [
+ "lazy_static 1.3.0",
+ "regex",
+]
+
 [[package]]
 name = "ucd-trie"
 version = "0.1.1"

@@ -4951,6 +4961,13 @@ dependencies = [
  "version_check 0.1.5",
 ]

+[[package]]
+name = "unicode-bdd"
+version = "0.1.0"
+dependencies = [
+ "ucd-parse",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.4"

‎Cargo.toml

+1

@@ -23,6 +23,7 @@ members = [
   "src/tools/rustfmt",
   "src/tools/miri",
   "src/tools/rustdoc-themes",
+  "src/tools/unicode-table-generator",
 ]
 exclude = [
   "build",
src/tools/unicode-table-generator/Cargo.toml

+10 (new file)

@@ -0,0 +1,10 @@
+[package]
+name = "unicode-bdd"
+version = "0.1.0"
+authors = ["Mark Rousskov <mark.simulacrum@gmail.com>"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+ucd-parse = "0.1.3"
src/tools/unicode-table-generator/src/case_mapping.rs

+62 (new file)

@@ -0,0 +1,62 @@
+use crate::{fmt_list, UnicodeData};
+use std::fmt;
+
+pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
+    let mut file = String::new();
+
+    file.push_str(HEADER.trim_start());
+
+    let decl_type = "&[(char, [char; 3])]";
+
+    file.push_str(&format!(
+        "static LOWERCASE_TABLE: {} = &[{}];",
+        decl_type,
+        fmt_list(data.to_lower.iter().map(to_mapping))
+    ));
+    file.push_str("\n\n");
+    file.push_str(&format!(
+        "static UPPERCASE_TABLE: {} = &[{}];",
+        decl_type,
+        fmt_list(data.to_upper.iter().map(to_mapping))
+    ));
+    file
+}
+
+fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
+    (
+        CharEscape(std::char::from_u32(*key).unwrap()),
+        [
+            CharEscape(std::char::from_u32(*a).unwrap()),
+            CharEscape(std::char::from_u32(*b).unwrap()),
+            CharEscape(std::char::from_u32(*c).unwrap()),
+        ],
+    )
+}
+
+struct CharEscape(char);
+
+impl fmt::Debug for CharEscape {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "'{}'", self.0.escape_default())
+    }
+}
+
+static HEADER: &str = "
+pub fn to_lower(c: char) -> [char; 3] {
+    match bsearch_case_table(c, LOWERCASE_TABLE) {
+        None => [c, '\\0', '\\0'],
+        Some(index) => LOWERCASE_TABLE[index].1,
+    }
+}
+
+pub fn to_upper(c: char) -> [char; 3] {
+    match bsearch_case_table(c, UPPERCASE_TABLE) {
+        None => [c, '\\0', '\\0'],
+        Some(index) => UPPERCASE_TABLE[index].1,
+    }
+}
+
+fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
+    table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
+}
+";
src/tools/unicode-table-generator/src/main.rs

+261 (new file)

@@ -0,0 +1,261 @@
+use std::collections::{BTreeMap, HashMap};
+use std::ops::Range;
+use ucd_parse::Codepoints;
+
+mod case_mapping;
+mod raw_emitter;
+mod unicode_download;
+
+use raw_emitter::{emit_codepoints, RawEmitter};
+
+static PROPERTIES: &[&str] = &[
+    "Alphabetic",
+    "Lowercase",
+    "Uppercase",
+    "Cased",
+    "Case_Ignorable",
+    "Grapheme_Extend",
+    "White_Space",
+    "Cc",
+    "N",
+];
+
+struct UnicodeData {
+    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
+    to_upper: BTreeMap<u32, (u32, u32, u32)>,
+    to_lower: BTreeMap<u32, (u32, u32, u32)>,
+}
+
+fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> {
+    let mut a = None;
+    let mut b = None;
+    let mut c = None;
+
+    for codepoint in codepoints {
+        if origin == codepoint.value() {
+            return None;
+        }
+
+        if a.is_none() {
+            a = Some(codepoint.value());
+        } else if b.is_none() {
+            b = Some(codepoint.value());
+        } else if c.is_none() {
+            c = Some(codepoint.value());
+        } else {
+            panic!("more than 3 mapped codepoints")
+        }
+    }
+
+    Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)))
+}
+
+static UNICODE_DIRECTORY: &str = "unicode-downloads";
+
+fn load_data() -> UnicodeData {
+    unicode_download::fetch_latest();
+
+    let mut properties = HashMap::new();
+    for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() {
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+            properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+        }
+    }
+    for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() {
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) {
+            properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints);
+        }
+    }
+
+    let mut to_lower = BTreeMap::new();
+    let mut to_upper = BTreeMap::new();
+    for row in ucd_parse::UnicodeDataExpander::new(
+        ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(),
+    ) {
+        let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) {
+            "N"
+        } else {
+            row.general_category.as_str()
+        };
+        if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) {
+            properties
+                .entry(*name)
+                .or_insert_with(Vec::new)
+                .push(Codepoints::Single(row.codepoint));
+        }
+
+        if let Some(mapped) = row.simple_lowercase_mapping {
+            if mapped != row.codepoint {
+                to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+            }
+        }
+        if let Some(mapped) = row.simple_uppercase_mapping {
+            if mapped != row.codepoint {
+                to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
+            }
+        }
+    }
+
+    for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() {
+        if !row.conditions.is_empty() {
+            // Skip conditional case mappings
+            continue;
+        }
+
+        let key = row.codepoint.value();
+        if let Some(lower) = to_mapping(key, row.lowercase) {
+            to_lower.insert(key, lower);
+        }
+        if let Some(upper) = to_mapping(key, row.uppercase) {
+            to_upper.insert(key, upper);
+        }
+    }
+
+    let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties
+        .into_iter()
+        .map(|(k, v)| {
+            (
+                k,
+                v.into_iter()
+                    .flat_map(|codepoints| match codepoints {
+                        Codepoints::Single(c) => c
+                            .scalar()
+                            .map(|ch| (ch as u32..ch as u32 + 1))
+                            .into_iter()
+                            .collect::<Vec<_>>(),
+                        Codepoints::Range(c) => c
+                            .into_iter()
+                            .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
+                            .collect::<Vec<_>>(),
+                    })
+                    .collect::<Vec<Range<u32>>>(),
+            )
+        })
+        .collect();
+
+    for ranges in properties.values_mut() {
+        merge_ranges(ranges);
+    }
+
+    let mut properties = properties.into_iter().collect::<Vec<_>>();
+    properties.sort_by_key(|p| p.0);
+    UnicodeData { ranges: properties, to_lower, to_upper }
+}
+
+fn main() {
+    let write_location = std::env::args().nth(1).unwrap_or_else(|| {
+        eprintln!("Must provide path to write unicode tables to");
+        eprintln!(
+            "e.g. {} src/libcore/unicode/unicode_data.rs",
+            std::env::args().nth(0).unwrap_or_default()
+        );
+        std::process::exit(1);
+    });
+
+    let unicode_data = load_data();
+    let ranges_by_property = &unicode_data.ranges;
+
+    let mut total_bytes = 0;
+    let mut modules = Vec::new();
+    for (property, ranges) in ranges_by_property {
+        let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>();
+        let mut emitter = RawEmitter::new();
+        emit_codepoints(&mut emitter, &ranges);
+
+        modules.push((property.to_lowercase().to_string(), emitter.file));
+        println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,);
+        total_bytes += emitter.bytes_used;
+    }
+
+    let mut table_file = String::new();
+
+    table_file.push_str(
+        "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n",
+    );
+
+    table_file.push_str("use super::range_search;\n\n");
+
+    table_file.push_str(&version());
+
+    table_file.push('\n');
+
+    modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data)));
+
+    for (name, contents) in modules {
+        table_file.push_str("#[rustfmt::skip]\n");
+        table_file.push_str(&format!("pub mod {} {{\n", name));
+        for line in contents.lines() {
+            if !line.trim().is_empty() {
+                table_file.push_str("    ");
+                table_file.push_str(&line);
+            }
+            table_file.push('\n');
+        }
+        table_file.push_str("}\n\n");
+    }
+
+    std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap();
+
+    println!("Total table sizes: {} bytes", total_bytes);
+}
+
+fn version() -> String {
+    let mut out = String::new();
+    out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = ");
+
+    let readme =
+        std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt"))
+            .unwrap();
+
+    let prefix = "for Version ";
+    let start = readme.find(prefix).unwrap() + prefix.len();
+    let end = readme.find(" of the Unicode Standard.").unwrap();
+    let version =
+        readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
+    let [major, minor, micro] = [version[0], version[1], version[2]];
+
+    out.push_str(&format!("({}, {}, {});\n", major, minor, micro));
+    out
+}
+
+fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
+    let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
+    let mut out = String::new();
+    let mut line = format!("\n    ");
+    for piece in pieces {
+        if line.len() + piece.len() < 98 {
+            line.push_str(&piece);
+        } else {
+            out.push_str(line.trim_end());
+            out.push('\n');
+            line = format!("    {}", piece);
+        }
+    }
+    out.push_str(line.trim_end());
+    out.push('\n');
+    out
+}
+
+fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
+    loop {
+        let mut new_ranges = Vec::new();
+        let mut idx_iter = 0..(ranges.len() - 1);
+        while let Some(idx) = idx_iter.next() {
+            let cur = ranges[idx].clone();
+            let next = ranges[idx + 1].clone();
+            if cur.end == next.start {
+                let _ = idx_iter.next(); // skip next as we're merging it in
+                new_ranges.push(cur.start..next.end);
+            } else {
+                new_ranges.push(cur);
+            }
+        }
+        new_ranges.push(ranges.last().unwrap().clone());
+        if new_ranges.len() == ranges.len() {
+            *ranges = new_ranges;
+            break;
+        } else {
+            *ranges = new_ranges;
+        }
+    }
+}
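
Note: merge_ranges coalesces the flattened one-codepoint ranges that load_data produces into contiguous blocks, merging one adjacent pair per step and looping until a fixed point. A self-contained illustration with hand-picked input; the function body is copied verbatim from the diff above so the example runs standalone.

use std::ops::Range;

fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    // Same algorithm as in main.rs above.
    loop {
        let mut new_ranges = Vec::new();
        let mut idx_iter = 0..(ranges.len() - 1);
        while let Some(idx) = idx_iter.next() {
            let cur = ranges[idx].clone();
            let next = ranges[idx + 1].clone();
            if cur.end == next.start {
                let _ = idx_iter.next(); // skip next as we're merging it in
                new_ranges.push(cur.start..next.end);
            } else {
                new_ranges.push(cur);
            }
        }
        new_ranges.push(ranges.last().unwrap().clone());
        if new_ranges.len() == ranges.len() {
            *ranges = new_ranges;
            break;
        } else {
            *ranges = new_ranges;
        }
    }
}

fn main() {
    // The digits '0'..='9' arrive as ten one-codepoint ranges, plus a lone 'A':
    let mut ranges: Vec<Range<u32>> =
        (b'0'..=b'9').map(|b| b as u32..b as u32 + 1).chain(std::iter::once(65..66)).collect();
    merge_ranges(&mut ranges);
    // The adjacent digit ranges collapse into one block; the gap before 'A' is kept.
    assert_eq!(ranges, vec![48..58, 65..66]);
}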
src/tools/unicode-table-generator/src/raw_emitter.rs

+170 (new file)

@@ -0,0 +1,170 @@
+//! This implements the core logic of the compression scheme used to compactly
+//! encode the Unicode character classes.
+//!
+//! The primary idea is that we 'flatten' the Unicode ranges into an enormous
+//! bitset. To represent any arbitrary codepoint in a raw bitset, we would need
+//! over 17 kilobytes of data per character set -- way too much for our
+//! purposes.
+//!
+//! We have two primary goals with the encoding: we want to be compact, because
+//! these tables often end up in ~every Rust program (especially the
+//! grapheme_extend table, used for str debugging), including those for embedded
+//! targets (where space is important). We also want to be relatively fast,
+//! though this is more of a nice to have rather than a key design constraint.
+//! In practice, due to modern processor design these two are closely related.
+//!
+//! The encoding scheme here compresses the bitset by first deduplicating the
+//! "words" (64 bits on all platforms). In practice very few words are present
+//! in most data sets.
+//!
+//! This gives us an array that maps `u8 -> word` (if we ever went beyond 256
+//! words, we could go to u16 -> word or have some dual compression scheme
+//! mapping into two separate sets; currently this is not dealt with).
+//!
+//! With that scheme, we now have a single byte for every 64 codepoints. We
+//! further group these by 16 (arbitrarily chosen), and again deduplicate and
+//! store in an array (u8 -> [u8; 16]).
+//!
+//! The indices into this array represent ranges of 64*16 = 1024 codepoints.
+//!
+//! This already reduces the top-level array to at most 1,086 bytes, but in
+//! practice we usually can encode in far fewer (the first couple Unicode planes
+//! are dense).
+//!
+//! The last byte of this top-level array is pulled out to a separate static
+//! and trailing zeros are dropped; this is simply because grapheme_extend and
+//! case_ignorable have a single entry in the 896th entry, so this shrinks them
+//! down considerably.
+
+use crate::fmt_list;
+use std::collections::{BTreeSet, HashMap};
+use std::convert::TryFrom;
+use std::fmt::Write;
+use std::ops::Range;
+
+pub struct RawEmitter {
+    pub file: String,
+    pub bytes_used: usize,
+}
+
+impl RawEmitter {
+    pub fn new() -> RawEmitter {
+        RawEmitter { file: String::new(), bytes_used: 0 }
+    }
+
+    fn blank_line(&mut self) {
+        if self.file.is_empty() || self.file.ends_with("\n\n") {
+            return;
+        }
+        writeln!(&mut self.file, "").unwrap();
+    }
+
+    fn emit_bitset(&mut self, words: &[u64]) {
+        let unique_words =
+            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
+        if unique_words.len() > u8::max_value() as usize {
+            panic!("cannot pack {} into 8 bits", unique_words.len());
+        }
+
+        let word_indices = unique_words
+            .iter()
+            .cloned()
+            .enumerate()
+            .map(|(idx, word)| (word, u8::try_from(idx).unwrap()))
+            .collect::<HashMap<_, _>>();
+
+        let mut idx = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
+        let chunk_length = 16;
+        for _ in 0..(chunk_length - (idx.len() % chunk_length)) {
+            assert_eq!(unique_words[0], 0, "first word is all zeros");
+            // pad out bitset index with zero words so we have all chunks of 16
+            idx.push(0);
+        }
+
+        let mut chunks = BTreeSet::new();
+        for chunk in idx.chunks(chunk_length) {
+            chunks.insert(chunk);
+        }
+        let chunk_map = chunks
+            .clone()
+            .into_iter()
+            .enumerate()
+            .map(|(idx, chunk)| (chunk, idx))
+            .collect::<HashMap<_, _>>();
+        let mut chunk_indices = Vec::new();
+        for chunk in idx.chunks(chunk_length) {
+            chunk_indices.push(chunk_map[chunk]);
+        }
+        writeln!(
+            &mut self.file,
+            "static BITSET_LAST_CHUNK_MAP: (u16, u8) = ({}, {});",
+            chunk_indices.len() - 1,
+            chunk_indices.pop().unwrap(),
+        )
+        .unwrap();
+        self.bytes_used += 3;
+        // Strip out the empty pieces, presuming our above pop() made us now
+        // have some trailing zeros.
+        assert_eq!(unique_words[0], 0, "first word is all zeros");
+        while let Some(0) = chunk_indices.last() {
+            chunk_indices.pop();
+        }
+        writeln!(
+            &mut self.file,
+            "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
+            chunk_indices.len(),
+            fmt_list(&chunk_indices),
+        )
+        .unwrap();
+        self.bytes_used += chunk_indices.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET_INDEX_CHUNKS: [[u8; 16]; {}] = [{}];",
+            chunks.len(),
+            fmt_list(chunks.iter()),
+        )
+        .unwrap();
+        self.bytes_used += 16 * chunks.len();
+        writeln!(
+            &mut self.file,
+            "static BITSET: [u64; {}] = [{}];",
+            unique_words.len(),
+            fmt_list(&unique_words),
+        )
+        .unwrap();
+        self.bytes_used += 8 * unique_words.len();
+    }
+
+    pub fn emit_lookup(&mut self) {
+        writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
+        writeln!(&mut self.file, "    super::range_search(",).unwrap();
+        writeln!(&mut self.file, "        c as u32,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
+        writeln!(&mut self.file, "        BITSET_LAST_CHUNK_MAP,").unwrap();
+        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
+        writeln!(&mut self.file, "        &BITSET,").unwrap();
+        writeln!(&mut self.file, "    )").unwrap();
+        writeln!(&mut self.file, "}}").unwrap();
+    }
+}
+
+pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
+    emitter.blank_line();
+
+    let last_code_point = ranges.last().unwrap().end;
+    // bitset for every bit in the codepoint range
+    //
+    // + 2 to ensure an all zero word to use for padding
+    let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
+    for range in ranges {
+        for codepoint in range.clone() {
+            let bucket = codepoint as usize / 64;
+            let bit = codepoint as u64 % 64;
+            buckets[bucket] |= 1 << bit;
+        }
+    }
+
+    emitter.emit_bitset(&buckets);
+    emitter.blank_line();
+    emitter.emit_lookup();
+}
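
Note: the generated lookup functions call a range_search helper (use super::range_search;) that lives outside this tool, so its body is not part of this commit. To make the decode side concrete, here is a sketch of what such a helper plausibly looks like, reconstructed purely from the shapes of the emitted statics -- treat the function body and the toy tables as assumptions, not as the libcore implementation. For scale: the full 0x110000 codepoints flatten to 17,408 64-bit words; deduplicating words leaves one index byte per word (~17 KB), and grouping those bytes by 16 shrinks the top level to roughly a kilobyte, as the module comment above explains.

// A hypothetical decode-side helper, inferred from the emitted table shapes;
// the real `range_search` used by libcore may differ.
fn range_search(
    needle: u32,
    chunk_idx_map: &[u8],                            // BITSET_CHUNKS_MAP (trailing zeros stripped)
    (last_chunk_idx, last_chunk_mapping): (u16, u8), // BITSET_LAST_CHUNK_MAP
    bitset_chunk_idx: &[[u8; 16]],                   // BITSET_INDEX_CHUNKS
    bitset: &[u64],                                  // BITSET (deduplicated words)
) -> bool {
    let bucket_idx = (needle / 64) as usize; // which 64-bit word holds this codepoint
    let chunk_map_idx = bucket_idx / 16; // which group of 16 word-indices
    let chunk_piece = bucket_idx % 16; // position inside that group
    let chunk_idx = if let Some(&idx) = chunk_idx_map.get(chunk_map_idx) {
        idx
    } else if chunk_map_idx == last_chunk_idx as usize {
        last_chunk_mapping // the entry pulled out into BITSET_LAST_CHUNK_MAP
    } else {
        return false; // a stripped trailing zero: no bits set out here
    };
    let word = bitset[bitset_chunk_idx[chunk_idx as usize][chunk_piece] as usize];
    (word & (1 << (needle % 64) as u64)) != 0
}

fn main() {
    // Toy tables for the single property { 'A'..='Z' }, laid out the way
    // emit_bitset would: word 0 is all zeros, word 1 has bits 1..=26 set
    // (codepoints 65..=90 fall in bucket 1), and the lone chunk entry was
    // pulled out into the (index, mapping) pair, leaving the chunks map empty.
    static BITSET_LAST_CHUNK_MAP: (u16, u8) = (0, 0);
    static BITSET_CHUNKS_MAP: [u8; 0] = [];
    static BITSET_INDEX_CHUNKS: [[u8; 16]; 1] = [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]];
    static BITSET: [u64; 2] = [0, 0x07FF_FFFE];

    let lookup = |c: char| {
        range_search(c as u32, &BITSET_CHUNKS_MAP, BITSET_LAST_CHUNK_MAP, &BITSET_INDEX_CHUNKS, &BITSET)
    };
    assert!(lookup('A') && lookup('Z'));
    assert!(!lookup('@') && !lookup('a') && !lookup('\u{1F600}'));
}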
src/tools/unicode-table-generator/src/unicode_download.rs

+42 (new file)

@@ -0,0 +1,42 @@
+use crate::UNICODE_DIRECTORY;
+use std::path::Path;
+use std::process::Command;
+
+static URL_PREFIX: &str = "https://www.unicode.org/Public/UCD/latest/ucd/";
+
+static README: &str = "ReadMe.txt";
+
+static RESOURCES: &[&str] =
+    &["DerivedCoreProperties.txt", "PropList.txt", "UnicodeData.txt", "SpecialCasing.txt"];
+
+pub fn fetch_latest() {
+    let directory = Path::new(UNICODE_DIRECTORY);
+    if let Err(e) = std::fs::create_dir_all(directory) {
+        if e.kind() != std::io::ErrorKind::AlreadyExists {
+            panic!("Failed to create {:?}: {}", UNICODE_DIRECTORY, e);
+        }
+    }
+    let output = Command::new("curl").arg(URL_PREFIX.to_owned() + README).output().unwrap();
+    if !output.status.success() {
+        panic!(
+            "Failed to run curl to fetch readme: stderr: {}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+    }
+    let current = std::fs::read_to_string(directory.join(README)).unwrap_or_default();
+    if current.as_bytes() != &output.stdout[..] {
+        std::fs::write(directory.join(README), output.stdout).unwrap();
+    }
+
+    for resource in RESOURCES {
+        let output = Command::new("curl").arg(URL_PREFIX.to_owned() + resource).output().unwrap();
+        if !output.status.success() {
+            panic!(
+                "Failed to run curl to fetch {}: stderr: {}",
+                resource,
+                String::from_utf8_lossy(&output.stderr)
+            );
+        }
+        std::fs::write(directory.join(resource), output.stdout).unwrap();
+    }
+}
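
Note: putting the pieces together, the tool first shells out to curl to fetch the UCD files it needs into unicode-downloads/ (which the .gitignore change above now covers in place of the old per-file src/libcore/unicode entries), then parses them, compresses each property into the bitset tables, and writes everything to the path passed as its first argument -- per the usage hint in main.rs, e.g. src/libcore/unicode/unicode_data.rs.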
