|
| 1 | +use std::collections::{BTreeMap, HashMap}; |
| 2 | +use std::ops::Range; |
| 3 | +use ucd_parse::Codepoints; |
| 4 | + |
| 5 | +mod case_mapping; |
| 6 | +mod raw_emitter; |
| 7 | +mod unicode_download; |
| 8 | + |
| 9 | +use raw_emitter::{emit_codepoints, RawEmitter}; |
| 10 | + |
/// Names of the properties for which lookup tables are generated.
///
/// `load_data` compares every parsed property name (and every general
/// category) against this list and ignores rows whose name is not present.
static PROPERTIES: &[&str] = &[
    "Alphabetic",
    "Lowercase",
    "Uppercase",
    "Cased",
    "Case_Ignorable",
    "Grapheme_Extend",
    "White_Space",
    "Cc",
    // `load_data` folds the general categories "Nd", "Nl" and "No" into this name.
    "N",
];
| 22 | + |
/// Everything `load_data` extracts from the Unicode data files.
struct UnicodeData {
    /// One entry per property name, each paired with the half-open codepoint
    /// ranges collected for that property.
    ranges: Vec<(&'static str, Vec<Range<u32>>)>,
    /// Uppercase mappings keyed by the source codepoint. A mapping holds up to
    /// three codepoints; unused trailing slots are `0`.
    to_upper: BTreeMap<u32, (u32, u32, u32)>,
    /// Lowercase mappings, laid out the same way as `to_upper`.
    to_lower: BTreeMap<u32, (u32, u32, u32)>,
}
| 28 | + |
| 29 | +fn to_mapping(origin: u32, codepoints: Vec<ucd_parse::Codepoint>) -> Option<(u32, u32, u32)> { |
| 30 | + let mut a = None; |
| 31 | + let mut b = None; |
| 32 | + let mut c = None; |
| 33 | + |
| 34 | + for codepoint in codepoints { |
| 35 | + if origin == codepoint.value() { |
| 36 | + return None; |
| 37 | + } |
| 38 | + |
| 39 | + if a.is_none() { |
| 40 | + a = Some(codepoint.value()); |
| 41 | + } else if b.is_none() { |
| 42 | + b = Some(codepoint.value()); |
| 43 | + } else if c.is_none() { |
| 44 | + c = Some(codepoint.value()); |
| 45 | + } else { |
| 46 | + panic!("more than 3 mapped codepoints") |
| 47 | + } |
| 48 | + } |
| 49 | + |
| 50 | + Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) |
| 51 | +} |
| 52 | + |
/// Directory the Unicode data files are parsed from; `version` also reads
/// `ReadMe.txt` from here.
static UNICODE_DIRECTORY: &str = "unicode-downloads";
| 54 | + |
| 55 | +fn load_data() -> UnicodeData { |
| 56 | + unicode_download::fetch_latest(); |
| 57 | + |
| 58 | + let mut properties = HashMap::new(); |
| 59 | + for row in ucd_parse::parse::<_, ucd_parse::CoreProperty>(&UNICODE_DIRECTORY).unwrap() { |
| 60 | + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { |
| 61 | + properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); |
| 62 | + } |
| 63 | + } |
| 64 | + for row in ucd_parse::parse::<_, ucd_parse::Property>(&UNICODE_DIRECTORY).unwrap() { |
| 65 | + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == row.property.as_str()) { |
| 66 | + properties.entry(*name).or_insert_with(Vec::new).push(row.codepoints); |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + let mut to_lower = BTreeMap::new(); |
| 71 | + let mut to_upper = BTreeMap::new(); |
| 72 | + for row in ucd_parse::UnicodeDataExpander::new( |
| 73 | + ucd_parse::parse::<_, ucd_parse::UnicodeData>(&UNICODE_DIRECTORY).unwrap(), |
| 74 | + ) { |
| 75 | + let general_category = if ["Nd", "Nl", "No"].contains(&row.general_category.as_str()) { |
| 76 | + "N" |
| 77 | + } else { |
| 78 | + row.general_category.as_str() |
| 79 | + }; |
| 80 | + if let Some(name) = PROPERTIES.iter().find(|prop| **prop == general_category) { |
| 81 | + properties |
| 82 | + .entry(*name) |
| 83 | + .or_insert_with(Vec::new) |
| 84 | + .push(Codepoints::Single(row.codepoint)); |
| 85 | + } |
| 86 | + |
| 87 | + if let Some(mapped) = row.simple_lowercase_mapping { |
| 88 | + if mapped != row.codepoint { |
| 89 | + to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); |
| 90 | + } |
| 91 | + } |
| 92 | + if let Some(mapped) = row.simple_uppercase_mapping { |
| 93 | + if mapped != row.codepoint { |
| 94 | + to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); |
| 95 | + } |
| 96 | + } |
| 97 | + } |
| 98 | + |
| 99 | + for row in ucd_parse::parse::<_, ucd_parse::SpecialCaseMapping>(&UNICODE_DIRECTORY).unwrap() { |
| 100 | + if !row.conditions.is_empty() { |
| 101 | + // Skip conditional case mappings |
| 102 | + continue; |
| 103 | + } |
| 104 | + |
| 105 | + let key = row.codepoint.value(); |
| 106 | + if let Some(lower) = to_mapping(key, row.lowercase) { |
| 107 | + to_lower.insert(key, lower); |
| 108 | + } |
| 109 | + if let Some(upper) = to_mapping(key, row.uppercase) { |
| 110 | + to_upper.insert(key, upper); |
| 111 | + } |
| 112 | + } |
| 113 | + |
| 114 | + let mut properties: HashMap<&'static str, Vec<Range<u32>>> = properties |
| 115 | + .into_iter() |
| 116 | + .map(|(k, v)| { |
| 117 | + ( |
| 118 | + k, |
| 119 | + v.into_iter() |
| 120 | + .flat_map(|codepoints| match codepoints { |
| 121 | + Codepoints::Single(c) => c |
| 122 | + .scalar() |
| 123 | + .map(|ch| (ch as u32..ch as u32 + 1)) |
| 124 | + .into_iter() |
| 125 | + .collect::<Vec<_>>(), |
| 126 | + Codepoints::Range(c) => c |
| 127 | + .into_iter() |
| 128 | + .flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1))) |
| 129 | + .collect::<Vec<_>>(), |
| 130 | + }) |
| 131 | + .collect::<Vec<Range<u32>>>(), |
| 132 | + ) |
| 133 | + }) |
| 134 | + .collect(); |
| 135 | + |
| 136 | + for ranges in properties.values_mut() { |
| 137 | + merge_ranges(ranges); |
| 138 | + } |
| 139 | + |
| 140 | + let mut properties = properties.into_iter().collect::<Vec<_>>(); |
| 141 | + properties.sort_by_key(|p| p.0); |
| 142 | + UnicodeData { ranges: properties, to_lower, to_upper } |
| 143 | +} |
| 144 | + |
| 145 | +fn main() { |
| 146 | + let write_location = std::env::args().nth(1).unwrap_or_else(|| { |
| 147 | + eprintln!("Must provide path to write unicode tables to"); |
| 148 | + eprintln!( |
| 149 | + "e.g. {} src/libcore/unicode/unicode_data.rs", |
| 150 | + std::env::args().nth(0).unwrap_or_default() |
| 151 | + ); |
| 152 | + std::process::exit(1); |
| 153 | + }); |
| 154 | + |
| 155 | + let unicode_data = load_data(); |
| 156 | + let ranges_by_property = &unicode_data.ranges; |
| 157 | + |
| 158 | + let mut total_bytes = 0; |
| 159 | + let mut modules = Vec::new(); |
| 160 | + for (property, ranges) in ranges_by_property { |
| 161 | + let datapoints = ranges.iter().map(|r| r.end - r.start).sum::<u32>(); |
| 162 | + let mut emitter = RawEmitter::new(); |
| 163 | + emit_codepoints(&mut emitter, &ranges); |
| 164 | + |
| 165 | + modules.push((property.to_lowercase().to_string(), emitter.file)); |
| 166 | + println!("{:15}: {} bytes, {} codepoints", property, emitter.bytes_used, datapoints,); |
| 167 | + total_bytes += emitter.bytes_used; |
| 168 | + } |
| 169 | + |
| 170 | + let mut table_file = String::new(); |
| 171 | + |
| 172 | + table_file.push_str( |
| 173 | + "///! This file is generated by src/tools/unicode-table-generator; do not edit manually!\n", |
| 174 | + ); |
| 175 | + |
| 176 | + table_file.push_str("use super::range_search;\n\n"); |
| 177 | + |
| 178 | + table_file.push_str(&version()); |
| 179 | + |
| 180 | + table_file.push('\n'); |
| 181 | + |
| 182 | + modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); |
| 183 | + |
| 184 | + for (name, contents) in modules { |
| 185 | + table_file.push_str("#[rustfmt::skip]\n"); |
| 186 | + table_file.push_str(&format!("pub mod {} {{\n", name)); |
| 187 | + for line in contents.lines() { |
| 188 | + if !line.trim().is_empty() { |
| 189 | + table_file.push_str(" "); |
| 190 | + table_file.push_str(&line); |
| 191 | + } |
| 192 | + table_file.push('\n'); |
| 193 | + } |
| 194 | + table_file.push_str("}\n\n"); |
| 195 | + } |
| 196 | + |
| 197 | + std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); |
| 198 | + |
| 199 | + println!("Total table sizes: {} bytes", total_bytes); |
| 200 | +} |
| 201 | + |
| 202 | +fn version() -> String { |
| 203 | + let mut out = String::new(); |
| 204 | + out.push_str("pub const UNICODE_VERSION: (u32, u32, u32) = "); |
| 205 | + |
| 206 | + let readme = |
| 207 | + std::fs::read_to_string(std::path::Path::new(UNICODE_DIRECTORY).join("ReadMe.txt")) |
| 208 | + .unwrap(); |
| 209 | + |
| 210 | + let prefix = "for Version "; |
| 211 | + let start = readme.find(prefix).unwrap() + prefix.len(); |
| 212 | + let end = readme.find(" of the Unicode Standard.").unwrap(); |
| 213 | + let version = |
| 214 | + readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>(); |
| 215 | + let [major, minor, micro] = [version[0], version[1], version[2]]; |
| 216 | + |
| 217 | + out.push_str(&format!("({}, {}, {});\n", major, minor, micro)); |
| 218 | + out |
| 219 | +} |
| 220 | + |
/// Formats `values` as a wrapped, comma-separated list of `Debug` renderings.
///
/// The result starts with a newline; every value is followed by `", "`, a new
/// line is started whenever appending the next value would bring the current
/// line to 98 bytes or more, trailing whitespace is stripped from each line,
/// and the result ends with a newline.
fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
    let mut out = String::new();
    // NOTE(review): the indent after the newline (and on wrapped lines below)
    // is a single space in this copy of the file; confirm it was not collapsed
    // from a wider indent.
    let mut line = String::from("\n ");
    for value in values {
        let piece = format!("{:?}, ", value);
        if line.len() + piece.len() >= 98 {
            // Current line is full: flush it and start a fresh indented one.
            out.push_str(line.trim_end());
            out.push('\n');
            line.clear();
            line.push(' ');
        }
        line.push_str(&piece);
    }
    out.push_str(line.trim_end());
    out.push('\n');
    out
}
| 238 | + |
/// Coalesces neighbouring ranges that touch exactly, in place.
///
/// Two consecutive entries are merged when the first one's `end` equals the
/// second one's `start`; chains of touching ranges collapse into a single
/// range. Only neighbours in the vector are compared, so the input has to be
/// ordered for all touching ranges to be found. Overlapping or disjoint
/// neighbours are left untouched, and an empty vector stays empty.
fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
    let mut merged: Vec<Range<u32>> = Vec::with_capacity(ranges.len());
    for range in ranges.drain(..) {
        // Extend the previously kept range when the new one starts exactly
        // where it ends. Because the last element is folded in here rather
        // than appended after the loop, it can never be emitted twice.
        if let Some(prev) = merged.last_mut() {
            if prev.end == range.start {
                prev.end = range.end;
                continue;
            }
        }
        merged.push(range);
    }
    *ranges = merged;
}
0 commit comments