polars_io/csv/read/
splitfields.rs

1#![allow(unsafe_op_in_unsafe_fn)]
#[cfg(not(feature = "simd"))]
mod inner {
    /// An adapted version of `std::iter::Split` that yields the fields of one CSV line.
    ///
    /// This exists solely because we cannot split the line naively on the separator:
    /// a field that starts with the quote character may contain separator and
    /// end-of-line bytes that belong to the field (e.g. `"Street, City"`).
    ///
    /// Every item is `(field, needs_escaping)`. `field` is the raw byte range, including
    /// any surrounding quotes; `needs_escaping` is `true` when quoting is enabled and the
    /// field starts with the quote character.
    ///
    /// Iteration stops after the field that is terminated by `eol_char`, or after the
    /// last field when the input runs out. When the remaining input is empty one final
    /// empty field is yielded, so `a,` produces `a` followed by an empty field.
    pub(crate) struct SplitFields<'a> {
        /// Bytes that have not been yielded yet.
        v: &'a [u8],
        separator: u8,
        /// Set once the last field has been yielded; `next` returns `None` afterwards.
        finished: bool,
        quote_char: u8,
        /// Whether `quote_char` is honoured at all.
        quoting: bool,
        eol_char: u8,
    }

    impl<'a> SplitFields<'a> {
        /// Creates a splitter over `slice`.
        ///
        /// Passing `None` as `quote_char` disables quote handling, in which case every
        /// `separator` or `eol_char` byte ends a field.
        pub(crate) fn new(
            slice: &'a [u8],
            separator: u8,
            quote_char: Option<u8>,
            eol_char: u8,
        ) -> Self {
            Self {
                v: slice,
                separator,
                finished: false,
                // The fallback value has no effect: `quoting` is false whenever it is used.
                quote_char: quote_char.unwrap_or(b'"'),
                quoting: quote_char.is_some(),
                eol_char,
            }
        }

        /// Marks the iterator as finished and yields the bytes before the EOL at `idx`.
        ///
        /// # Safety
        ///
        /// `idx` must be less than or equal to `self.v.len()`.
        unsafe fn finish_eol(
            &mut self,
            need_escaping: bool,
            idx: usize,
        ) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            debug_assert!(idx <= self.v.len());
            // SAFETY: the caller guarantees `idx <= self.v.len()`.
            Some((unsafe { self.v.get_unchecked(..idx) }, need_escaping))
        }

        /// Marks the iterator as finished and yields everything that is left as one field.
        fn finish(&mut self, need_escaping: bool) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            Some((self.v, need_escaping))
        }

        /// Returns `true` when `current_ch` is the separator or the EOL byte, i.e. a byte
        /// that ends a field when it occurs outside of quotes.
        fn eof_oel(&self, current_ch: u8) -> bool {
            current_ch == self.separator || current_ch == self.eol_char
        }
    }

    impl<'a> Iterator for SplitFields<'a> {
        // the bool is used to indicate that it requires escaping
        type Item = (&'a [u8], bool);

        #[inline]
        fn next(&mut self) -> Option<(&'a [u8], bool)> {
            if self.finished {
                return None;
            }
            let first = match self.v.first() {
                Some(&byte) => byte,
                // Nothing left: emit the trailing empty field and stop.
                None => return self.finish(false),
            };

            // There can be strings with separators:
            // "Street, City",
            let needs_escaping = self.quoting && first == self.quote_char;

            // The index is tracked as `usize` (via `position`) so that fields of any
            // length are handled; a narrower counter would overflow on huge fields.
            let end = if needs_escaping {
                // There can be pair of double-quotes within string.
                // Each of the embedded double-quote characters must be represented
                // by a pair of double-quote characters:
                // e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020

                // denotes if we are in a string field, started with a quote
                let mut in_field = false;
                self.v.iter().position(|&c| {
                    if c == self.quote_char {
                        // toggle between string field enclosure
                        //      if we encounter a starting '"' -> in_field = true;
                        //      if we encounter a closing '"' -> in_field = false;
                        in_field = !in_field;
                    }
                    !in_field && self.eof_oel(c)
                })
            } else {
                self.v.iter().position(|&c| self.eof_oel(c))
            };

            let pos = match end {
                Some(pos) => pos,
                // No terminator left: the remainder is the last field.
                None => return self.finish(needs_escaping),
            };
            debug_assert!(pos < self.v.len());

            // SAFETY: `position` only returns indices of existing elements,
            // so `pos < self.v.len()`.
            let terminator = unsafe { *self.v.get_unchecked(pos) };
            if terminator == self.eol_char {
                // SAFETY: `pos < self.v.len()`, see above.
                return unsafe { self.finish_eol(needs_escaping, pos) };
            }

            // SAFETY: `pos < self.v.len()`, hence both `..pos` and `pos + 1..` are in bounds.
            unsafe {
                let field = self.v.get_unchecked(..pos);
                // Skip the separator itself.
                self.v = self.v.get_unchecked(pos + 1..);
                Some((field, needs_escaping))
            }
        }
    }
}
137
#[cfg(feature = "simd")]
mod inner {
    use std::simd::prelude::*;

    use polars_utils::clmul::prefix_xorsum_inclusive;

    /// Number of bytes examined per vector step; equals the lane count of `SimdVec`.
    const SIMD_SIZE: usize = 64;
    type SimdVec = u8x64;

    /// An adapted version of `std::iter::Split` that yields the fields of one CSV line.
    ///
    /// This exists solely because we cannot split the line naively on the separator:
    /// a field that starts with the quote character may contain separator and
    /// end-of-line bytes that belong to the field (e.g. `"Street, City"`).
    ///
    /// Every item is `(field, needs_escaping)`. `field` is the raw byte range, including
    /// any surrounding quotes; `needs_escaping` is `true` when quoting is enabled and the
    /// field starts with the quote character.
    ///
    /// Iteration stops after the field that is terminated by `eol_char`, or after the
    /// last field when the input runs out.
    pub(crate) struct SplitFields<'a> {
        /// Bytes that have not been yielded yet.
        pub v: &'a [u8],
        separator: u8,
        /// Set once the last field has been yielded; `next` returns `None` afterwards.
        pub finished: bool,
        quote_char: u8,
        /// Whether `quote_char` is honoured at all.
        quoting: bool,
        eol_char: u8,
        // Splatted copies of the three special bytes, built once in `new`.
        simd_separator: SimdVec,
        simd_eol_char: SimdVec,
        simd_quote_char: SimdVec,
        /// Field ends already located by the last vector step of a quoted field,
        /// relative to the start of `v`: bit `i` set means `v[i]` ends a field.
        /// Zero means nothing is cached.
        previous_valid_ends: u64,
    }

    impl<'a> SplitFields<'a> {
        /// Creates a splitter over `slice`.
        ///
        /// Passing `None` as `quote_char` disables quote handling, in which case every
        /// `separator` or `eol_char` byte ends a field.
        pub(crate) fn new(
            slice: &'a [u8],
            separator: u8,
            quote_char: Option<u8>,
            eol_char: u8,
        ) -> Self {
            let simd_separator = SimdVec::splat(separator);
            let simd_eol_char = SimdVec::splat(eol_char);
            let quoting = quote_char.is_some();
            // The fallback value has no effect: `quoting` is false whenever it is used.
            let quote_char = quote_char.unwrap_or(b'"');
            let simd_quote_char = SimdVec::splat(quote_char);

            Self {
                v: slice,
                separator,
                finished: false,
                quote_char,
                quoting,
                eol_char,
                simd_separator,
                simd_eol_char,
                simd_quote_char,
                previous_valid_ends: 0,
            }
        }

        /// Marks the iterator as finished and yields the bytes before the EOL at `pos`.
        ///
        /// # Safety
        ///
        /// `pos` must be less than or equal to `self.v.len()`.
        unsafe fn finish_eol(
            &mut self,
            need_escaping: bool,
            pos: usize,
        ) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            debug_assert!(pos <= self.v.len());
            // SAFETY: the caller guarantees `pos <= self.v.len()`.
            Some((unsafe { self.v.get_unchecked(..pos) }, need_escaping))
        }

        /// Marks the iterator as finished and yields everything that is left as one field.
        #[inline]
        fn finish(&mut self, need_escaping: bool) -> Option<(&'a [u8], bool)> {
            self.finished = true;
            Some((self.v, need_escaping))
        }

        /// Returns `true` when `current_ch` is the separator or the EOL byte, i.e. a byte
        /// that ends a field when it occurs outside of quotes.
        fn eof_oel(&self, current_ch: u8) -> bool {
            current_ch == self.separator || current_ch == self.eol_char
        }
    }

    impl<'a> Iterator for SplitFields<'a> {
        // the bool is used to indicate that it requires escaping
        type Item = (&'a [u8], bool);

        #[inline]
        fn next(&mut self) -> Option<(&'a [u8], bool)> {
            // This must be before we check the cached value
            if self.finished {
                return None;
            }
            // Then check cached value as this is hot.
            if self.previous_valid_ends != 0 {
                let pos = self.previous_valid_ends.trailing_zeros() as usize;
                // Re-base the remaining cached ends onto the slice that follows this
                // field. The mask was shifted right when it was stored, so its top bit
                // is clear and `pos + 1` is always a valid shift amount.
                self.previous_valid_ends >>= (pos + 1) as u64;

                debug_assert!(pos < self.v.len());
                // SAFETY: the cached bits were derived from bytes of `self.v`, so
                // `pos` indexes an existing byte.
                unsafe {
                    let needs_escaping = self
                        .v
                        .first()
                        .map(|c| *c == self.quote_char && self.quoting)
                        .unwrap_or(false);

                    if *self.v.get_unchecked(pos) == self.eol_char {
                        return self.finish_eol(needs_escaping, pos);
                    }

                    let bytes = self.v.get_unchecked(..pos);
                    // Skip the separator itself.
                    self.v = self.v.get_unchecked(pos + 1..);

                    return Some((bytes, needs_escaping));
                }
            }
            if self.v.is_empty() {
                // Nothing left: emit the trailing empty field and stop.
                return self.finish(false);
            }

            let mut needs_escaping = false;
            // There can be strings with separators:
            // "Street, City",

            // SAFETY:
            // we have checked bounds
            let pos = if self.quoting && unsafe { *self.v.get_unchecked(0) } == self.quote_char {
                let mut total_idx = 0;
                needs_escaping = true;
                // Quote state carried from one 64-byte step to the next.
                let mut not_in_field_previous_iter = true;

                loop {
                    // SAFETY: `total_idx` only grows by amounts that stay within `self.v`.
                    let bytes = unsafe { self.v.get_unchecked(total_idx..) };

                    if bytes.len() > SIMD_SIZE {
                        // SAFETY: `bytes` holds more than `SIMD_SIZE` elements, so the
                        // sub-slice exists and has exactly `SIMD_SIZE` elements.
                        let lane: [u8; SIMD_SIZE] = unsafe {
                            bytes
                                .get_unchecked(0..SIMD_SIZE)
                                .try_into()
                                .unwrap_unchecked()
                        };
                        let simd_bytes = SimdVec::from(lane);
                        let has_eol = simd_bytes.simd_eq(self.simd_eol_char);
                        let has_sep = simd_bytes.simd_eq(self.simd_separator);
                        let quote_mask = simd_bytes.simd_eq(self.simd_quote_char).to_bitmask();
                        let mut end_mask = (has_sep | has_eol).to_bitmask();

                        // NOTE(review): presumably an inclusive prefix XOR over the quote
                        // bits, i.e. the quote parity at every byte - confirm against
                        // `polars_utils::clmul`.
                        let mut not_in_quote_field = prefix_xorsum_inclusive(quote_mask);

                        // When this step starts outside of quotes the parity has to be
                        // inverted so that set bits mean "outside of quotes".
                        if not_in_field_previous_iter {
                            not_in_quote_field = !not_in_quote_field;
                        }
                        not_in_field_previous_iter =
                            (not_in_quote_field & (1 << (SIMD_SIZE - 1))) > 0;
                        // Only terminators outside of quotes end a field.
                        end_mask &= not_in_quote_field;

                        if end_mask != 0 {
                            let pos = end_mask.trailing_zeros() as usize;
                            total_idx += pos;
                            debug_assert!(
                                self.v[total_idx] == self.eol_char
                                    || self.v[total_idx] == self.separator
                            );

                            // Cache the remaining ends of this step for the next calls.
                            // Shifting a `u64` by 64 is not allowed, hence the special case.
                            if pos == SIMD_SIZE - 1 {
                                self.previous_valid_ends = 0;
                            } else {
                                self.previous_valid_ends = end_mask >> (pos + 1) as u64;
                            }

                            break;
                        } else {
                            total_idx += SIMD_SIZE;
                        }
                    } else {
                        // There can be a pair of double-quotes within a string.
                        // Each of the embedded double-quote characters must be represented
                        // by a pair of double-quote characters:
                        // e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020

                        // denotes if we are in a string field, started with a quote
                        let mut in_field = !not_in_field_previous_iter;

                        // usize::MAX is unset.
                        let mut idx = usize::MAX;
                        let mut current_idx = 0;
                        // micro optimizations
                        #[allow(clippy::explicit_counter_loop)]
                        for &c in bytes.iter() {
                            if c == self.quote_char {
                                // toggle between string field enclosure
                                //      if we encounter a starting '"' -> in_field = true;
                                //      if we encounter a closing '"' -> in_field = false;
                                in_field = !in_field;
                            }

                            if !in_field && self.eof_oel(c) {
                                if c == self.eol_char {
                                    // SAFETY:
                                    // we are in bounds
                                    return unsafe {
                                        self.finish_eol(needs_escaping, current_idx + total_idx)
                                    };
                                }
                                idx = current_idx;
                                break;
                            }
                            current_idx += 1;
                        }

                        if idx == usize::MAX {
                            // No terminator left: the remainder is the last field.
                            return self.finish(needs_escaping);
                        }

                        total_idx += idx;
                        debug_assert!(
                            self.v[total_idx] == self.eol_char
                                || self.v[total_idx] == self.separator
                        );
                        break;
                    }
                }
                total_idx
            } else {
                let mut total_idx = 0;

                loop {
                    // SAFETY: `total_idx` only grows by amounts that stay within `self.v`.
                    let bytes = unsafe { self.v.get_unchecked(total_idx..) };

                    if bytes.len() > SIMD_SIZE {
                        // SAFETY: `bytes` holds more than `SIMD_SIZE` elements, so the
                        // sub-slice exists and has exactly `SIMD_SIZE` elements.
                        let lane: [u8; SIMD_SIZE] = unsafe {
                            bytes
                                .get_unchecked(0..SIMD_SIZE)
                                .try_into()
                                .unwrap_unchecked()
                        };
                        let simd_bytes = SimdVec::from(lane);
                        let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char);
                        let has_separator = simd_bytes.simd_eq(self.simd_separator);
                        let has_any_mask = (has_separator | has_eol_char).to_bitmask();

                        if has_any_mask != 0 {
                            total_idx += has_any_mask.trailing_zeros() as usize;
                            break;
                        } else {
                            total_idx += SIMD_SIZE;
                        }
                    } else {
                        match bytes.iter().position(|&c| self.eof_oel(c)) {
                            None => return self.finish(needs_escaping),
                            Some(idx) => {
                                total_idx += idx;
                                break;
                            },
                        }
                    }
                }
                total_idx
            };

            debug_assert!(pos < self.v.len());
            // SAFETY: every path that reaches this point set `pos` to the index of a
            // separator or EOL byte it located inside `self.v`, so `pos < self.v.len()`.
            unsafe {
                // Make sure the iterator is done when the field is terminated by EOL.
                // This has to cover the vectorized quoted path as well, which reports
                // separators and EOL bytes through the same mask; otherwise iteration
                // would run on past the end of the line.
                if *self.v.get_unchecked(pos) == self.eol_char {
                    return self.finish_eol(needs_escaping, pos);
                }

                let ret = Some((self.v.get_unchecked(..pos), needs_escaping));
                // Skip the separator itself.
                self.v = self.v.get_unchecked(pos + 1..);
                ret
            }
        }
    }
}
408
409pub(crate) use inner::SplitFields;
410
#[cfg(test)]
mod test {
    use super::SplitFields;

    /// Quoted fields are returned verbatim (quotes included) and flagged as needing
    /// escaping, even when they contain the EOL byte; unquoted fields are not flagged.
    #[test]
    fn test_splitfields() {
        // (line, separator, expected fields in order)
        let cases: [(&str, u8, &[(&str, bool)]); 2] = [
            (
                "\"foo\",\"bar\"",
                b',',
                &[("\"foo\"", true), ("\"bar\"", true)],
            ),
            (
                "\"foo\n bar\";\"baz\";12345",
                b';',
                &[("\"foo\n bar\"", true), ("\"baz\"", true), ("12345", false)],
            ),
        ];

        for &(line, separator, expected) in cases.iter() {
            let mut fields = SplitFields::new(line.as_bytes(), separator, Some(b'"'), b'\n');

            for &(text, needs_escaping) in expected {
                assert_eq!(fields.next(), Some((text.as_bytes(), needs_escaping)));
            }
            // The iterator must be exhausted after the last expected field.
            assert_eq!(fields.next(), None);
        }
    }
}