1#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(not(feature = "simd"))]
mod inner {
    /// Scalar (non-SIMD) iterator over the fields of a CSV record.
    ///
    /// Every item is a `(field, needs_escaping)` pair. `field` borrows from the
    /// input slice and is returned verbatim, i.e. including any surrounding
    /// quotes. `needs_escaping` is `true` when quoting is enabled and the field
    /// starts with the quote character.
    ///
    /// Iteration ends after the field that is terminated by `eol_char`, or
    /// after the last field when the input runs out.
    pub(crate) struct SplitFields<'a> {
        /// Bytes that have not been handed out yet.
        v: &'a [u8],
        /// Byte that separates two fields.
        separator: u8,
        /// Set once the final field has been returned.
        finished: bool,
        /// Quote byte; only consulted when `quoting` is `true`.
        quote_char: u8,
        /// Whether quoted fields are recognised at all.
        quoting: bool,
        /// Byte that terminates the record.
        eol_char: u8,
    }

    impl<'a> SplitFields<'a> {
        /// Creates a splitter over `slice`.
        ///
        /// Passing `None` as `quote_char` disables quote handling; the quote
        /// byte then defaults to `"` but is never consulted.
        pub(crate) fn new(
            slice: &'a [u8],
            separator: u8,
            quote_char: Option<u8>,
            eol_char: u8,
        ) -> Self {
            Self {
                v: slice,
                separator,
                finished: false,
                quote_char: quote_char.unwrap_or(b'"'),
                quoting: quote_char.is_some(),
                eol_char,
            }
        }

        /// Marks the iterator as finished and returns the bytes before `idx`.
        ///
        /// # Safety
        ///
        /// `idx` must be `<= self.v.len()`.
        unsafe fn finish_eol(
            &mut self,
            need_escaping: bool,
            idx: usize,
        ) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            debug_assert!(idx <= self.v.len());
            Some((self.v.get_unchecked(..idx), need_escaping))
        }

        /// Marks the iterator as finished and returns all remaining bytes.
        fn finish(&mut self, need_escaping: bool) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            Some((self.v, need_escaping))
        }

        /// Returns `true` when `current_ch` terminates a field, i.e. it is the
        /// separator or the end-of-line byte.
        fn eof_oel(&self, current_ch: u8) -> bool {
            current_ch == self.separator || current_ch == self.eol_char
        }
    }

    impl<'a> Iterator for SplitFields<'a> {
        type Item = (&'a [u8], bool);

        #[inline]
        fn next(&mut self) -> Option<(&'a [u8], bool)> {
            if self.finished {
                return None;
            }
            if self.v.is_empty() {
                // Exhausted input without a terminator (e.g. right after a
                // trailing separator): yield one last empty field.
                return self.finish(false);
            }

            // `self.v` is non-empty here, so indexing the first byte is fine.
            let needs_escaping = self.quoting && self.v[0] == self.quote_char;

            // Position of the byte that terminates the current field, if any.
            // The index is a `usize` so arbitrarily long fields cannot overflow
            // the counter.
            let terminator = if needs_escaping {
                // Terminators between an opening and a closing quote belong to
                // the field, so track the quote state while scanning.
                let mut in_quotes = false;
                self.v.iter().position(|&c| {
                    if c == self.quote_char {
                        in_quotes = !in_quotes;
                    }
                    !in_quotes && self.eof_oel(c)
                })
            } else {
                self.v.iter().position(|&c| self.eof_oel(c))
            };

            let pos = match terminator {
                Some(pos) => pos,
                // No terminator left: the rest of the input is the last field.
                None => return self.finish(needs_escaping),
            };

            // SAFETY: `pos` was produced by `position` over `self.v`, hence
            // `pos < self.v.len()` and both `..pos` and `pos + 1..` are in
            // bounds.
            unsafe {
                debug_assert!(pos < self.v.len());
                if *self.v.get_unchecked(pos) == self.eol_char {
                    return self.finish_eol(needs_escaping, pos);
                }
                let field = self.v.get_unchecked(..pos);
                // Skip the separator itself.
                self.v = self.v.get_unchecked(pos + 1..);
                Some((field, needs_escaping))
            }
        }
    }
}
137
#[cfg(feature = "simd")]
mod inner {
    use std::simd::prelude::*;

    use polars_utils::clmul::prefix_xorsum_inclusive;

    /// Number of bytes inspected per vectorised step.
    const SIMD_SIZE: usize = 64;
    type SimdVec = u8x64;

    /// SIMD-accelerated iterator over the fields of a CSV record.
    ///
    /// Every item is a `(field, needs_escaping)` pair. `field` borrows from the
    /// input slice and is returned verbatim; `needs_escaping` is `true` when
    /// quoting is enabled and the field starts with the quote character.
    ///
    /// Iteration ends after the field that is terminated by `eol_char`, or
    /// after the last field when the input runs out.
    pub(crate) struct SplitFields<'a> {
        /// Bytes that have not been handed out yet.
        pub v: &'a [u8],
        /// Byte that separates two fields.
        separator: u8,
        /// Set once the final field has been returned.
        pub finished: bool,
        /// Quote byte; only consulted when `quoting` is `true`.
        quote_char: u8,
        /// Whether quoted fields are recognised at all.
        quoting: bool,
        /// Byte that terminates the record.
        eol_char: u8,
        /// `separator` broadcast to every lane.
        simd_separator: SimdVec,
        /// `eol_char` broadcast to every lane.
        simd_eol_char: SimdVec,
        /// `quote_char` broadcast to every lane.
        simd_quote_char: SimdVec,
        /// Terminator positions left over from the last vectorised scan of a
        /// quoted field: bit `k` set means `self.v[k]` is a separator or
        /// end-of-line byte that lies outside quotes.
        previous_valid_ends: u64,
    }

    impl<'a> SplitFields<'a> {
        /// Creates a splitter over `slice`.
        ///
        /// Passing `None` as `quote_char` disables quote handling; the quote
        /// byte then defaults to `"` but is never consulted.
        pub(crate) fn new(
            slice: &'a [u8],
            separator: u8,
            quote_char: Option<u8>,
            eol_char: u8,
        ) -> Self {
            let simd_separator = SimdVec::splat(separator);
            let simd_eol_char = SimdVec::splat(eol_char);
            let quoting = quote_char.is_some();
            let quote_char = quote_char.unwrap_or(b'"');
            let simd_quote_char = SimdVec::splat(quote_char);

            Self {
                v: slice,
                separator,
                finished: false,
                quote_char,
                quoting,
                eol_char,
                simd_separator,
                simd_eol_char,
                simd_quote_char,
                previous_valid_ends: 0,
            }
        }

        /// Marks the iterator as finished and returns the bytes before `pos`.
        ///
        /// # Safety
        ///
        /// `pos` must be `<= self.v.len()`.
        unsafe fn finish_eol(
            &mut self,
            need_escaping: bool,
            pos: usize,
        ) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            debug_assert!(pos <= self.v.len());
            Some((self.v.get_unchecked(..pos), need_escaping))
        }

        /// Marks the iterator as finished and returns all remaining bytes.
        #[inline]
        fn finish(&mut self, need_escaping: bool) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            Some((self.v, need_escaping))
        }

        /// Returns `true` when `current_ch` terminates a field, i.e. it is the
        /// separator or the end-of-line byte.
        fn eof_oel(&self, current_ch: u8) -> bool {
            current_ch == self.separator || current_ch == self.eol_char
        }
    }

    impl<'a> Iterator for SplitFields<'a> {
        type Item = (&'a [u8], bool);

        #[inline]
        fn next(&mut self) -> Option<(&'a [u8], bool)> {
            if self.finished {
                return None;
            }
            // Fast path: a previous vectorised scan already located further
            // terminators, so the next field boundary is known.
            if self.previous_valid_ends != 0 {
                let pos = self.previous_valid_ends.trailing_zeros() as usize;
                // Re-base the remaining bits onto the slice that starts right
                // after this terminator.
                self.previous_valid_ends >>= (pos + 1) as u64;

                // SAFETY: every set bit of `previous_valid_ends` refers to a
                // byte of the lane it was computed from, which lies inside
                // `self.v`, so `pos < self.v.len()`.
                unsafe {
                    debug_assert!(pos < self.v.len());
                    let needs_escaping = self
                        .v
                        .first()
                        .map(|c| *c == self.quote_char && self.quoting)
                        .unwrap_or(false);

                    if *self.v.get_unchecked(pos) == self.eol_char {
                        return self.finish_eol(needs_escaping, pos);
                    }

                    let bytes = self.v.get_unchecked(..pos);

                    self.v = self.v.get_unchecked(pos + 1..);
                    let ret = Some((bytes, needs_escaping));

                    return ret;
                }
            }
            if self.v.is_empty() {
                // Exhausted input without a terminator (e.g. right after a
                // trailing separator): yield one last empty field.
                return self.finish(false);
            }

            let mut needs_escaping = false;
            // SAFETY (first byte read): `self.v` is non-empty at this point.
            let pos = if self.quoting && unsafe { *self.v.get_unchecked(0) } == self.quote_char {
                // Quoted field: terminators between quotes must be ignored.
                let mut total_idx = 0;
                needs_escaping = true;
                // Quote state carried from one 64-byte lane to the next.
                let mut not_in_field_previous_iter = true;

                loop {
                    // SAFETY: `total_idx` only advances by `SIMD_SIZE` while
                    // more than `SIMD_SIZE` bytes remain, so it stays in bounds.
                    let bytes = unsafe { self.v.get_unchecked(total_idx..) };

                    if bytes.len() > SIMD_SIZE {
                        // SAFETY: `bytes` holds more than `SIMD_SIZE` bytes, so
                        // the sub-slice exists and has exactly `SIMD_SIZE`
                        // elements.
                        let lane: [u8; SIMD_SIZE] = unsafe {
                            bytes
                                .get_unchecked(0..SIMD_SIZE)
                                .try_into()
                                .unwrap_unchecked()
                        };
                        let simd_bytes = SimdVec::from(lane);
                        let has_eol = simd_bytes.simd_eq(self.simd_eol_char);
                        let has_sep = simd_bytes.simd_eq(self.simd_separator);
                        let quote_mask = simd_bytes.simd_eq(self.simd_quote_char).to_bitmask();
                        let mut end_mask = (has_sep | has_eol).to_bitmask();

                        // NOTE(review): `prefix_xorsum_inclusive` is presumably
                        // a running XOR over the bits of `quote_mask`, i.e. the
                        // quote parity at every byte of the lane; confirm
                        // against `polars_utils::clmul`.
                        let mut not_in_quote_field = prefix_xorsum_inclusive(quote_mask);

                        // Flip the mask when the lane starts outside quotes so
                        // that set bits mean "outside quotes".
                        if not_in_field_previous_iter {
                            not_in_quote_field = !not_in_quote_field;
                        }
                        // The state of the last byte carries over to the next
                        // lane.
                        not_in_field_previous_iter =
                            (not_in_quote_field & (1 << (SIMD_SIZE - 1))) > 0;
                        // Keep only terminators that are outside quotes.
                        end_mask &= not_in_quote_field;

                        if end_mask != 0 {
                            let pos = end_mask.trailing_zeros() as usize;
                            total_idx += pos;
                            debug_assert!(
                                self.v[total_idx] == self.eol_char
                                    || self.v[total_idx] == self.separator
                            );

                            // An end-of-line terminator ends the record, just
                            // like in the scalar tail, the unquoted path and
                            // the fast path above; without this check the
                            // iterator would run on past the line end.
                            // SAFETY: `pos < SIMD_SIZE < bytes.len()`, so
                            // `total_idx < self.v.len()`.
                            unsafe {
                                if *self.v.get_unchecked(total_idx) == self.eol_char {
                                    return self.finish_eol(needs_escaping, total_idx);
                                }
                            }

                            // Remember the remaining terminators of this lane,
                            // re-based onto the slice that starts after `pos`.
                            // Shifting a `u64` by 64 is not allowed, hence the
                            // special case for the last lane position.
                            if pos == SIMD_SIZE - 1 {
                                self.previous_valid_ends = 0;
                            } else {
                                self.previous_valid_ends = end_mask >> (pos + 1) as u64;
                            }

                            break;
                        } else {
                            total_idx += SIMD_SIZE;
                        }
                    } else {
                        // Scalar tail for the last (at most `SIMD_SIZE`) bytes.
                        let mut in_field = !not_in_field_previous_iter;

                        // `usize::MAX` marks "no terminator found".
                        let mut idx = usize::MAX;
                        let mut current_idx = 0;
                        #[allow(clippy::explicit_counter_loop)]
                        for &c in bytes.iter() {
                            if c == self.quote_char {
                                in_field = !in_field;
                            }

                            if !in_field && self.eof_oel(c) {
                                if c == self.eol_char {
                                    // SAFETY: `current_idx` indexes `bytes`,
                                    // which starts at `total_idx` in `self.v`.
                                    return unsafe {
                                        self.finish_eol(needs_escaping, current_idx + total_idx)
                                    };
                                }
                                idx = current_idx;
                                break;
                            }
                            current_idx += 1;
                        }

                        if idx == usize::MAX {
                            // No terminator left: the rest is the last field.
                            return self.finish(needs_escaping);
                        }

                        total_idx += idx;
                        debug_assert!(
                            self.v[total_idx] == self.eol_char
                                || self.v[total_idx] == self.separator
                        );
                        break;
                    }
                }
                total_idx
            } else {
                // Unquoted field: the first separator or end-of-line byte ends
                // it, quotes are not interpreted.
                let mut total_idx = 0;

                loop {
                    // SAFETY: `total_idx` only advances by `SIMD_SIZE` while
                    // more than `SIMD_SIZE` bytes remain, so it stays in bounds.
                    let bytes = unsafe { self.v.get_unchecked(total_idx..) };

                    if bytes.len() > SIMD_SIZE {
                        // SAFETY: `bytes` holds more than `SIMD_SIZE` bytes, so
                        // the sub-slice exists and has exactly `SIMD_SIZE`
                        // elements.
                        let lane: [u8; SIMD_SIZE] = unsafe {
                            bytes
                                .get_unchecked(0..SIMD_SIZE)
                                .try_into()
                                .unwrap_unchecked()
                        };
                        let simd_bytes = SimdVec::from(lane);
                        let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char);
                        let has_separator = simd_bytes.simd_eq(self.simd_separator);
                        let has_any_mask = (has_separator | has_eol_char).to_bitmask();

                        if has_any_mask != 0 {
                            total_idx += has_any_mask.trailing_zeros() as usize;
                            break;
                        } else {
                            total_idx += SIMD_SIZE;
                        }
                    } else {
                        // Scalar tail for the last (at most `SIMD_SIZE`) bytes.
                        match bytes.iter().position(|&c| self.eof_oel(c)) {
                            None => return self.finish(needs_escaping),
                            Some(idx) => {
                                total_idx += idx;
                                break;
                            },
                        }
                    }
                }
                // SAFETY: both ways out of the loop leave `total_idx` on a
                // terminator byte inside `self.v`.
                unsafe {
                    if *self.v.get_unchecked(total_idx) == self.eol_char {
                        return self.finish_eol(needs_escaping, total_idx);
                    } else {
                        total_idx
                    }
                }
            };

            // SAFETY: `pos` is the index of a terminator byte inside `self.v`,
            // so `..pos` and `pos + 1..` are both in bounds.
            unsafe {
                debug_assert!(pos < self.v.len());
                let ret = Some((self.v.get_unchecked(..pos), needs_escaping));
                // Skip the separator itself.
                self.v = self.v.get_unchecked(pos + 1..);
                ret
            }
        }
    }
}
408
409pub(crate) use inner::SplitFields;
410
#[cfg(test)]
mod test {
    use super::SplitFields;

    /// Splits `input` on `separator` (quote `"`, end of line `\n`) and checks
    /// that exactly the `expected` `(field, needs_escaping)` pairs come out,
    /// in order, followed by `None`.
    fn assert_fields(input: &str, separator: u8, expected: &[(&str, bool)]) {
        let mut fields = SplitFields::new(input.as_bytes(), separator, Some(b'"'), b'\n');

        for &(field, needs_escaping) in expected {
            assert_eq!(fields.next(), Some((field.as_bytes(), needs_escaping)));
        }
        assert_eq!(fields.next(), None);
    }

    #[test]
    fn test_splitfields() {
        // Quoted fields are returned with their quotes and flagged.
        assert_fields(
            "\"foo\",\"bar\"",
            b',',
            &[("\"foo\"", true), ("\"bar\"", true)],
        );

        // An end-of-line byte inside quotes does not end the record.
        assert_fields(
            "\"foo\n bar\";\"baz\";12345",
            b';',
            &[("\"foo\n bar\"", true), ("\"baz\"", true), ("12345", false)],
        );
    }
}