|
| 1 | +// Copyright 2018 The Rust Project Developers. See the COPYRIGHT |
| 2 | +// file at the top-level directory of this distribution and at |
| 3 | +// http://rust-lang.org/COPYRIGHT. |
| 4 | +// |
| 5 | +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 6 | +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 8 | +// option. This file may not be copied, modified, or distributed |
| 9 | +// except according to those terms. |
| 10 | + |
| 11 | +use unicode_width::UnicodeWidthChar; |
| 12 | +use super::*; |
| 13 | + |
| 14 | +/// Find all newlines, multi-byte characters, and non-narrow characters in a |
| 15 | +/// FileMap. |
| 16 | +/// |
| 17 | +/// This function will use an SSE2 enhanced implementation if hardware support |
| 18 | +/// is detected at runtime. |
| 19 | +pub fn analyze_filemap( |
| 20 | + src: &str, |
| 21 | + filemap_start_pos: BytePos) |
| 22 | + -> (Vec<BytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) |
| 23 | +{ |
| 24 | + let mut lines = vec![filemap_start_pos]; |
| 25 | + let mut multi_byte_chars = vec![]; |
| 26 | + let mut non_narrow_chars = vec![]; |
| 27 | + |
| 28 | + // Calls the right implementation, depending on hardware support available. |
| 29 | + analyze_filemap_dispatch(src, |
| 30 | + filemap_start_pos, |
| 31 | + &mut lines, |
| 32 | + &mut multi_byte_chars, |
| 33 | + &mut non_narrow_chars); |
| 34 | + |
| 35 | + // The code above optimistically registers a new line *after* each \n |
| 36 | + // it encounters. If that point is already outside the filemap, remove |
| 37 | + // it again. |
| 38 | + if let Some(&last_line_start) = lines.last() { |
| 39 | + let file_map_end = filemap_start_pos + BytePos::from_usize(src.len()); |
| 40 | + assert!(file_map_end >= last_line_start); |
| 41 | + if last_line_start == file_map_end { |
| 42 | + lines.pop(); |
| 43 | + } |
| 44 | + } |
| 45 | + |
| 46 | + (lines, multi_byte_chars, non_narrow_chars) |
| 47 | +} |
| 48 | + |
| 49 | +cfg_if! { |
| 50 | + if #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), |
| 51 | + not(stage0)))] { |
| 52 | + fn analyze_filemap_dispatch(src: &str, |
| 53 | + filemap_start_pos: BytePos, |
| 54 | + lines: &mut Vec<BytePos>, |
| 55 | + multi_byte_chars: &mut Vec<MultiByteChar>, |
| 56 | + non_narrow_chars: &mut Vec<NonNarrowChar>) { |
| 57 | + if is_x86_feature_detected!("sse2") { |
| 58 | + unsafe { |
| 59 | + analyze_filemap_sse2(src, |
| 60 | + filemap_start_pos, |
| 61 | + lines, |
| 62 | + multi_byte_chars, |
| 63 | + non_narrow_chars); |
| 64 | + } |
| 65 | + } else { |
| 66 | + analyze_filemap_generic(src, |
| 67 | + src.len(), |
| 68 | + filemap_start_pos, |
| 69 | + lines, |
| 70 | + multi_byte_chars, |
| 71 | + non_narrow_chars); |
| 72 | + |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + /// Check 16 byte chunks of text at a time. If the chunk contains |
| 77 | + /// something other than printable ASCII characters and newlines, the |
| 78 | + /// function falls back to the generic implementation. Otherwise it uses |
| 79 | + /// SSE2 intrinsics to quickly find all newlines. |
| 80 | + #[target_feature(enable = "sse2")] |
| 81 | + unsafe fn analyze_filemap_sse2(src: &str, |
| 82 | + output_offset: BytePos, |
| 83 | + lines: &mut Vec<BytePos>, |
| 84 | + multi_byte_chars: &mut Vec<MultiByteChar>, |
| 85 | + non_narrow_chars: &mut Vec<NonNarrowChar>) { |
| 86 | + #[cfg(target_arch = "x86")] |
| 87 | + use std::arch::x86::*; |
| 88 | + #[cfg(target_arch = "x86_64")] |
| 89 | + use std::arch::x86_64::*; |
| 90 | + |
| 91 | + const CHUNK_SIZE: usize = 16; |
| 92 | + |
| 93 | + let src_bytes = src.as_bytes(); |
| 94 | + |
| 95 | + let chunk_count = src.len() / CHUNK_SIZE; |
| 96 | + |
| 97 | + // This variable keeps track of where we should start decoding a |
| 98 | + // chunk. If a multi-byte character spans across chunk boundaries, |
| 99 | + // we need to skip that part in the next chunk because we already |
| 100 | + // handled it. |
| 101 | + let mut intra_chunk_offset = 0; |
| 102 | + |
| 103 | + for chunk_index in 0 .. chunk_count { |
| 104 | + let ptr = src_bytes.as_ptr() as *const __m128i; |
| 105 | + let chunk = _mm_loadu_si128(ptr.offset(chunk_index as isize)); |
| 106 | + |
| 107 | + // For character in the chunk, see if its byte value is < 0, which |
| 108 | + // indicates that it's part of a UTF-8 char. |
| 109 | + let multibyte_test = _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)); |
| 110 | + // Create a bit mask from the comparison results. |
| 111 | + let multibyte_mask = _mm_movemask_epi8(multibyte_test); |
| 112 | + |
| 113 | + // If the bit mask is all zero, we only have ASCII chars here: |
| 114 | + if multibyte_mask == 0 { |
| 115 | + assert!(intra_chunk_offset == 0); |
| 116 | + |
| 117 | + // Check if there are any control characters in the chunk. All |
| 118 | + // control characters that we can encounter at this point have a |
| 119 | + // byte value less than 32 or ... |
| 120 | + let control_char_test0 = _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)); |
| 121 | + let control_char_mask0 = _mm_movemask_epi8(control_char_test0); |
| 122 | + |
| 123 | + // ... it's the ASCII 'DEL' character with a value of 127. |
| 124 | + let control_char_test1 = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)); |
| 125 | + let control_char_mask1 = _mm_movemask_epi8(control_char_test1); |
| 126 | + |
| 127 | + let control_char_mask = control_char_mask0 | control_char_mask1; |
| 128 | + |
| 129 | + if control_char_mask != 0 { |
| 130 | + // Check for newlines in the chunk |
| 131 | + let newlines_test = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)); |
| 132 | + let newlines_mask = _mm_movemask_epi8(newlines_test); |
| 133 | + |
| 134 | + if control_char_mask == newlines_mask { |
| 135 | + // All control characters are newlines, record them |
| 136 | + let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; |
| 137 | + let output_offset = output_offset + |
| 138 | + BytePos::from_usize(chunk_index * CHUNK_SIZE + 1); |
| 139 | + |
| 140 | + loop { |
| 141 | + let index = newlines_mask.trailing_zeros(); |
| 142 | + |
| 143 | + if index >= CHUNK_SIZE as u32 { |
| 144 | + // We have arrived at the end of the chunk. |
| 145 | + break |
| 146 | + } |
| 147 | + |
| 148 | + lines.push(BytePos(index) + output_offset); |
| 149 | + |
| 150 | + // Clear the bit, so we can find the next one. |
| 151 | + newlines_mask &= (!1) << index; |
| 152 | + } |
| 153 | + |
| 154 | + // We are done for this chunk. All control characters were |
| 155 | + // newlines and we took care of those. |
| 156 | + continue |
| 157 | + } else { |
| 158 | + // Some of the control characters are not newlines, |
| 159 | + // fall through to the slow path below. |
| 160 | + } |
| 161 | + } else { |
| 162 | + // No control characters, nothing to record for this chunk |
| 163 | + continue |
| 164 | + } |
| 165 | + } |
| 166 | + |
| 167 | + // The slow path. |
| 168 | + // There are control chars in here, fallback to generic decoding. |
| 169 | + let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; |
| 170 | + intra_chunk_offset = analyze_filemap_generic( |
| 171 | + &src[scan_start .. ], |
| 172 | + CHUNK_SIZE - intra_chunk_offset, |
| 173 | + BytePos::from_usize(scan_start) + output_offset, |
| 174 | + lines, |
| 175 | + multi_byte_chars, |
| 176 | + non_narrow_chars |
| 177 | + ); |
| 178 | + } |
| 179 | + |
| 180 | + // There might still be a tail left to analyze |
| 181 | + let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; |
| 182 | + if tail_start < src.len() { |
| 183 | + analyze_filemap_generic(&src[tail_start as usize ..], |
| 184 | + src.len() - tail_start, |
| 185 | + output_offset + BytePos::from_usize(tail_start), |
| 186 | + lines, |
| 187 | + multi_byte_chars, |
| 188 | + non_narrow_chars); |
| 189 | + } |
| 190 | + } |
| 191 | + } else { |
| 192 | + |
| 193 | + // The target (or compiler version) does not support SSE2 ... |
| 194 | + fn analyze_filemap_dispatch(src: &str, |
| 195 | + filemap_start_pos: BytePos, |
| 196 | + lines: &mut Vec<BytePos>, |
| 197 | + multi_byte_chars: &mut Vec<MultiByteChar>, |
| 198 | + non_narrow_chars: &mut Vec<NonNarrowChar>) { |
| 199 | + analyze_filemap_generic(src, |
| 200 | + src.len(), |
| 201 | + filemap_start_pos, |
| 202 | + lines, |
| 203 | + multi_byte_chars, |
| 204 | + non_narrow_chars); |
| 205 | + } |
| 206 | + } |
| 207 | +} |
| 208 | + |
| 209 | +// `scan_len` determines the number of bytes in `src` to scan. Note that the |
| 210 | +// function can read past `scan_len` if a multi-byte character start within the |
| 211 | +// range but extends past it. The overflow is returned by the function. |
| 212 | +fn analyze_filemap_generic(src: &str, |
| 213 | + scan_len: usize, |
| 214 | + output_offset: BytePos, |
| 215 | + lines: &mut Vec<BytePos>, |
| 216 | + multi_byte_chars: &mut Vec<MultiByteChar>, |
| 217 | + non_narrow_chars: &mut Vec<NonNarrowChar>) |
| 218 | + -> usize |
| 219 | +{ |
| 220 | + assert!(src.len() >= scan_len); |
| 221 | + let mut i = 0; |
| 222 | + let src_bytes = src.as_bytes(); |
| 223 | + |
| 224 | + while i < scan_len { |
| 225 | + let byte = unsafe { |
| 226 | + // We verified that i < scan_len <= src.len() |
| 227 | + *src_bytes.get_unchecked(i as usize) |
| 228 | + }; |
| 229 | + |
| 230 | + // How much to advance in order to get to the next UTF-8 char in the |
| 231 | + // string. |
| 232 | + let mut char_len = 1; |
| 233 | + |
| 234 | + if byte < 32 { |
| 235 | + // This is an ASCII control character, it could be one of the cases |
| 236 | + // that are interesting to us. |
| 237 | + |
| 238 | + let pos = BytePos::from_usize(i) + output_offset; |
| 239 | + |
| 240 | + match byte { |
| 241 | + b'\n' => { |
| 242 | + lines.push(pos + BytePos(1)); |
| 243 | + } |
| 244 | + b'\t' => { |
| 245 | + non_narrow_chars.push(NonNarrowChar::Tab(pos)); |
| 246 | + } |
| 247 | + _ => { |
| 248 | + non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); |
| 249 | + } |
| 250 | + } |
| 251 | + } else if byte >= 127 { |
| 252 | + // The slow path: |
| 253 | + // This is either ASCII control character "DEL" or the beginning of |
| 254 | + // a multibyte char. Just decode to `char`. |
| 255 | + let c = (&src[i..]).chars().next().unwrap(); |
| 256 | + char_len = c.len_utf8(); |
| 257 | + |
| 258 | + let pos = BytePos::from_usize(i) + output_offset; |
| 259 | + |
| 260 | + if char_len > 1 { |
| 261 | + assert!(char_len >=2 && char_len <= 4); |
| 262 | + let mbc = MultiByteChar { |
| 263 | + pos, |
| 264 | + bytes: char_len as u32, |
| 265 | + }; |
| 266 | + multi_byte_chars.push(mbc); |
| 267 | + } |
| 268 | + |
| 269 | + // Assume control characters are zero width. |
| 270 | + // FIXME: How can we decide between `width` and `width_cjk`? |
| 271 | + let char_width = UnicodeWidthChar::width(c).unwrap_or(0); |
| 272 | + |
| 273 | + if char_width != 1 { |
| 274 | + non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); |
| 275 | + } |
| 276 | + } |
| 277 | + |
| 278 | + i += char_len; |
| 279 | + } |
| 280 | + |
| 281 | + i - scan_len |
| 282 | +} |
| 283 | + |
| 284 | + |
| 285 | + |
// Generates one `#[test]` function per invocation. The test runs
// `analyze_filemap` on `text` with the given start position and compares the
// three returned lists against the expectations, which are written as plain
// integers / tuples:
//
//   lines:            start position of each line
//   multi_byte_chars: (position, length in bytes)
//   non_narrow_chars: (position, display width)
macro_rules! test {
    (case: $test_name:ident,
     text: $text:expr,
     filemap_start_pos: $filemap_start_pos:expr,
     lines: $lines:expr,
     multi_byte_chars: $multi_byte_chars:expr,
     non_narrow_chars: $non_narrow_chars:expr,) => (

    #[test]
    fn $test_name() {
        let start = BytePos($filemap_start_pos);
        let (actual_lines, actual_mbcs, actual_nncs) = analyze_filemap($text, start);

        // Line starts.
        let wanted_lines = $lines
            .into_iter()
            .map(BytePos)
            .collect::<Vec<BytePos>>();
        assert_eq!(actual_lines, wanted_lines);

        // Multi-byte characters.
        let wanted_mbcs = $multi_byte_chars
            .into_iter()
            .map(|(offset, len)| MultiByteChar { pos: BytePos(offset), bytes: len })
            .collect::<Vec<MultiByteChar>>();
        assert_eq!(actual_mbcs, wanted_mbcs);

        // Characters that are not exactly one column wide.
        let wanted_nncs = $non_narrow_chars
            .into_iter()
            .map(|(offset, columns)| NonNarrowChar::new(BytePos(offset), columns))
            .collect::<Vec<NonNarrowChar>>();
        assert_eq!(actual_nncs, wanted_nncs);
    })
}
| 327 | + |
| 328 | +test!( |
| 329 | + case: empty_text, |
| 330 | + text: "", |
| 331 | + filemap_start_pos: 0, |
| 332 | + lines: vec![], |
| 333 | + multi_byte_chars: vec![], |
| 334 | + non_narrow_chars: vec![], |
| 335 | +); |
| 336 | + |
| 337 | +test!( |
| 338 | + case: newlines_short, |
| 339 | + text: "a\nc", |
| 340 | + filemap_start_pos: 0, |
| 341 | + lines: vec![0, 2], |
| 342 | + multi_byte_chars: vec![], |
| 343 | + non_narrow_chars: vec![], |
| 344 | +); |
| 345 | + |
| 346 | +test!( |
| 347 | + case: newlines_long, |
| 348 | + text: "012345678\nabcdef012345678\na", |
| 349 | + filemap_start_pos: 0, |
| 350 | + lines: vec![0, 10, 26], |
| 351 | + multi_byte_chars: vec![], |
| 352 | + non_narrow_chars: vec![], |
| 353 | +); |
| 354 | + |
| 355 | +test!( |
| 356 | + case: newline_and_multi_byte_char_in_same_chunk, |
| 357 | + text: "01234β789\nbcdef0123456789abcdef", |
| 358 | + filemap_start_pos: 0, |
| 359 | + lines: vec![0, 11], |
| 360 | + multi_byte_chars: vec![(5, 2)], |
| 361 | + non_narrow_chars: vec![], |
| 362 | +); |
| 363 | + |
| 364 | +test!( |
| 365 | + case: newline_and_control_char_in_same_chunk, |
| 366 | + text: "01234\u{07}6789\nbcdef0123456789abcdef", |
| 367 | + filemap_start_pos: 0, |
| 368 | + lines: vec![0, 11], |
| 369 | + multi_byte_chars: vec![], |
| 370 | + non_narrow_chars: vec![(5, 0)], |
| 371 | +); |
| 372 | + |
| 373 | +test!( |
| 374 | + case: multi_byte_char_short, |
| 375 | + text: "aβc", |
| 376 | + filemap_start_pos: 0, |
| 377 | + lines: vec![0], |
| 378 | + multi_byte_chars: vec![(1, 2)], |
| 379 | + non_narrow_chars: vec![], |
| 380 | +); |
| 381 | + |
| 382 | +test!( |
| 383 | + case: multi_byte_char_long, |
| 384 | + text: "0123456789abcΔf012345β", |
| 385 | + filemap_start_pos: 0, |
| 386 | + lines: vec![0], |
| 387 | + multi_byte_chars: vec![(13, 2), (22, 2)], |
| 388 | + non_narrow_chars: vec![], |
| 389 | +); |
| 390 | + |
| 391 | +test!( |
| 392 | + case: multi_byte_char_across_chunk_boundary, |
| 393 | + text: "0123456789abcdeΔ123456789abcdef01234", |
| 394 | + filemap_start_pos: 0, |
| 395 | + lines: vec![0], |
| 396 | + multi_byte_chars: vec![(15, 2)], |
| 397 | + non_narrow_chars: vec![], |
| 398 | +); |
| 399 | + |
| 400 | +test!( |
| 401 | + case: multi_byte_char_across_chunk_boundary_tail, |
| 402 | + text: "0123456789abcdeΔ....", |
| 403 | + filemap_start_pos: 0, |
| 404 | + lines: vec![0], |
| 405 | + multi_byte_chars: vec![(15, 2)], |
| 406 | + non_narrow_chars: vec![], |
| 407 | +); |
| 408 | + |
| 409 | +test!( |
| 410 | + case: non_narrow_short, |
| 411 | + text: "0\t2", |
| 412 | + filemap_start_pos: 0, |
| 413 | + lines: vec![0], |
| 414 | + multi_byte_chars: vec![], |
| 415 | + non_narrow_chars: vec![(1, 4)], |
| 416 | +); |
| 417 | + |
| 418 | +test!( |
| 419 | + case: non_narrow_long, |
| 420 | + text: "01\t3456789abcdef01234567\u{07}9", |
| 421 | + filemap_start_pos: 0, |
| 422 | + lines: vec![0], |
| 423 | + multi_byte_chars: vec![], |
| 424 | + non_narrow_chars: vec![(2, 4), (24, 0)], |
| 425 | +); |
| 426 | + |
| 427 | +test!( |
| 428 | + case: output_offset_all, |
| 429 | + text: "01\t345\n789abcΔf01234567\u{07}9\nbcΔf", |
| 430 | + filemap_start_pos: 1000, |
| 431 | + lines: vec![0 + 1000, 7 + 1000, 27 + 1000], |
| 432 | + multi_byte_chars: vec![(13 + 1000, 2), (29 + 1000, 2)], |
| 433 | + non_narrow_chars: vec![(2 + 1000, 4), (24 + 1000, 0)], |
| 434 | +); |
0 commit comments