polars_io/csv/read/
buffer.rs

1use arrow::array::MutableBinaryViewArray;
2#[cfg(feature = "dtype-categorical")]
3use polars_core::chunked_array::builder::CategoricalChunkedBuilder;
4use polars_core::prelude::*;
5use polars_error::to_compute_err;
6#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
7use polars_time::chunkedarray::string::Pattern;
8#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
9use polars_time::prelude::string::infer::{
10    DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,
11};
12use polars_utils::vec::PushUnchecked;
13
14use super::options::CsvEncoding;
15use super::parser::{is_whitespace, skip_whitespace};
16use super::utils::escape_field;
17
18pub(crate) trait PrimitiveParser: PolarsNumericType {
19    fn parse(bytes: &[u8]) -> Option<Self::Native>;
20}
21
22impl PrimitiveParser for Float32Type {
23    #[inline]
24    fn parse(bytes: &[u8]) -> Option<f32> {
25        fast_float2::parse(bytes).ok()
26    }
27}
28impl PrimitiveParser for Float64Type {
29    #[inline]
30    fn parse(bytes: &[u8]) -> Option<f64> {
31        fast_float2::parse(bytes).ok()
32    }
33}
34
#[cfg(feature = "dtype-u8")]
impl PrimitiveParser for UInt8Type {
    /// Parses the field as `u8` via `atoi_simd::parse_skipped`; a parse error becomes `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u8> {
        let parsed: Result<u8, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
#[cfg(feature = "dtype-u16")]
impl PrimitiveParser for UInt16Type {
    /// Parses the field as `u16` via `atoi_simd::parse_skipped`; a parse error becomes `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<u16> {
        let parsed: Result<u16, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
49impl PrimitiveParser for UInt32Type {
50    #[inline]
51    fn parse(bytes: &[u8]) -> Option<u32> {
52        atoi_simd::parse_skipped(bytes).ok()
53    }
54}
55impl PrimitiveParser for UInt64Type {
56    #[inline]
57    fn parse(bytes: &[u8]) -> Option<u64> {
58        atoi_simd::parse_skipped(bytes).ok()
59    }
60}
#[cfg(feature = "dtype-i8")]
impl PrimitiveParser for Int8Type {
    /// Parses the field as `i8` via `atoi_simd::parse_skipped`; a parse error becomes `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i8> {
        let parsed: Result<i8, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
#[cfg(feature = "dtype-i16")]
impl PrimitiveParser for Int16Type {
    /// Parses the field as `i16` via `atoi_simd::parse_skipped`; a parse error becomes `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i16> {
        let parsed: Result<i16, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
75impl PrimitiveParser for Int32Type {
76    #[inline]
77    fn parse(bytes: &[u8]) -> Option<i32> {
78        atoi_simd::parse_skipped(bytes).ok()
79    }
80}
81impl PrimitiveParser for Int64Type {
82    #[inline]
83    fn parse(bytes: &[u8]) -> Option<i64> {
84        atoi_simd::parse_skipped(bytes).ok()
85    }
86}
#[cfg(feature = "dtype-i128")]
impl PrimitiveParser for Int128Type {
    /// Parses the field as `i128` via `atoi_simd::parse_skipped`; a parse error becomes `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<i128> {
        let parsed: Result<i128, _> = atoi_simd::parse_skipped(bytes);
        parsed.ok()
    }
}
94
95trait ParsedBuffer {
96    fn parse_bytes(
97        &mut self,
98        bytes: &[u8],
99        ignore_errors: bool,
100        _needs_escaping: bool,
101        _missing_is_null: bool,
102        _time_unit: Option<TimeUnit>,
103    ) -> PolarsResult<()>;
104}
105
106impl<T> ParsedBuffer for PrimitiveChunkedBuilder<T>
107where
108    T: PolarsNumericType + PrimitiveParser,
109{
110    #[inline]
111    fn parse_bytes(
112        &mut self,
113        bytes: &[u8],
114        ignore_errors: bool,
115        needs_escaping: bool,
116        _missing_is_null: bool,
117        _time_unit: Option<TimeUnit>,
118    ) -> PolarsResult<()> {
119        if bytes.is_empty() {
120            self.append_null()
121        } else {
122            let bytes = if needs_escaping {
123                &bytes[1..bytes.len() - 1]
124            } else {
125                bytes
126            };
127
128            // legacy comment (remember this if you decide to use Results again):
129            // its faster to work on options.
130            // if we need to throw an error, we parse again to be able to throw the error
131
132            match T::parse(bytes) {
133                Some(value) => self.append_value(value),
134                None => {
135                    // try again without whitespace
136                    if !bytes.is_empty() && is_whitespace(bytes[0]) {
137                        let bytes = skip_whitespace(bytes);
138                        return self.parse_bytes(
139                            bytes,
140                            ignore_errors,
141                            false, // escaping was already done
142                            _missing_is_null,
143                            None,
144                        );
145                    }
146                    polars_ensure!(
147                        bytes.is_empty() || ignore_errors,
148                        ComputeError: "remaining bytes non-empty",
149                    );
150                    self.append_null()
151                },
152            };
153        }
154        Ok(())
155    }
156}
157
158pub struct Utf8Field {
159    name: PlSmallStr,
160    mutable: MutableBinaryViewArray<[u8]>,
161    scratch: Vec<u8>,
162    quote_char: u8,
163    encoding: CsvEncoding,
164}
165
166impl Utf8Field {
167    fn new(
168        name: PlSmallStr,
169        capacity: usize,
170        quote_char: Option<u8>,
171        encoding: CsvEncoding,
172    ) -> Self {
173        Self {
174            name,
175            mutable: MutableBinaryViewArray::with_capacity(capacity),
176            scratch: vec![],
177            quote_char: quote_char.unwrap_or(b'"'),
178            encoding,
179        }
180    }
181}
182
183#[inline]
184pub fn validate_utf8(bytes: &[u8]) -> bool {
185    simdutf8::basic::from_utf8(bytes).is_ok()
186}
187
188impl ParsedBuffer for Utf8Field {
189    #[inline]
190    fn parse_bytes(
191        &mut self,
192        bytes: &[u8],
193        ignore_errors: bool,
194        needs_escaping: bool,
195        missing_is_null: bool,
196        _time_unit: Option<TimeUnit>,
197    ) -> PolarsResult<()> {
198        if bytes.is_empty() {
199            if missing_is_null {
200                self.mutable.push_null()
201            } else {
202                self.mutable.push(Some([]))
203            }
204            return Ok(());
205        }
206
207        // note that one branch writes without updating the length, so we must do that later.
208        let escaped_bytes = if needs_escaping {
209            self.scratch.clear();
210            self.scratch.reserve(bytes.len());
211            polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
212
213            // SAFETY:
214            // we just allocated enough capacity and data_len is correct.
215            unsafe {
216                let n_written =
217                    escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());
218                self.scratch.set_len(n_written);
219            }
220
221            self.scratch.as_slice()
222        } else {
223            bytes
224        };
225
226        if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {
227            // It is important that this happens after escaping, as invalid escaped string can produce
228            // invalid utf8.
229            let parse_result = validate_utf8(escaped_bytes);
230
231            match parse_result {
232                true => {
233                    let value = escaped_bytes;
234                    self.mutable.push_value(value)
235                },
236                false => {
237                    if matches!(self.encoding, CsvEncoding::LossyUtf8) {
238                        // TODO! do this without allocating
239                        let s = String::from_utf8_lossy(escaped_bytes);
240                        self.mutable.push_value(s.as_ref().as_bytes())
241                    } else if ignore_errors {
242                        self.mutable.push_null()
243                    } else {
244                        // If field before escaping is valid utf8, the escaping is incorrect.
245                        if needs_escaping && validate_utf8(bytes) {
246                            polars_bail!(ComputeError: "string field is not properly escaped");
247                        } else {
248                            polars_bail!(ComputeError: "invalid utf-8 sequence");
249                        }
250                    }
251                },
252            }
253        } else {
254            self.mutable.push_value(escaped_bytes)
255        }
256
257        Ok(())
258    }
259}
260
/// Buffer that collects the values of a categorical/enum column while parsing CSV.
#[cfg(feature = "dtype-categorical")]
pub struct CategoricalField<T: PolarsCategoricalType> {
    /// Reusable scratch space that receives the unescaped form of quoted fields.
    escape_scratch: Vec<u8>,
    /// Quote byte used when unescaping quoted fields.
    quote_char: u8,
    /// Builder that receives the parsed string values.
    builder: CategoricalChunkedBuilder<T>,
}
267
#[cfg(feature = "dtype-categorical")]
impl<T: PolarsCategoricalType> CategoricalField<T> {
    /// Creates a categorical buffer for `dtype` with space reserved for `capacity` values.
    ///
    /// When `quote_char` is `None`, the double quote (`"`) is used.
    fn new(name: PlSmallStr, capacity: usize, quote_char: Option<u8>, dtype: DataType) -> Self {
        let quote_char = quote_char.unwrap_or(b'"');
        let mut builder = CategoricalChunkedBuilder::new(name, dtype);
        builder.reserve(capacity);

        Self {
            escape_scratch: vec![],
            quote_char,
            builder,
        }
    }

    /// Appends one field to the categorical builder.
    ///
    /// An empty field is appended as null. The raw field must be valid UTF-8; otherwise a
    /// null is appended when `ignore_errors` is set and an error is returned when it is not.
    /// Quoted fields (`needs_escaping`) are unescaped before being appended.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        if !validate_utf8(bytes) {
            polars_ensure!(ignore_errors, ComputeError: "invalid utf-8 sequence");
            self.builder.append_null();
            return Ok(());
        }

        let key_bytes: &[u8] = if needs_escaping {
            polars_ensure!(bytes.len() > 1, ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
            self.escape_scratch.clear();
            self.escape_scratch.reserve(bytes.len());
            // SAFETY:
            // the scratch buffer is empty and was just given `bytes.len()` bytes of spare
            // capacity; `n_written` is taken to be the number of bytes `escape_field` wrote.
            unsafe {
                let n_written = escape_field(
                    bytes,
                    self.quote_char,
                    self.escape_scratch.spare_capacity_mut(),
                );
                self.escape_scratch.set_len(n_written);
            }
            &self.escape_scratch
        } else {
            bytes
        };

        // SAFETY:
        // the raw field passed the utf8 check above.
        // NOTE(review): for the escaped case this relies on `escape_field` keeping the
        // bytes valid UTF-8 — confirm.
        let key = unsafe { std::str::from_utf8_unchecked(key_bytes) };
        self.builder.append_str(key)?;
        Ok(())
    }
}
328
329impl ParsedBuffer for BooleanChunkedBuilder {
330    #[inline]
331    fn parse_bytes(
332        &mut self,
333        bytes: &[u8],
334        ignore_errors: bool,
335        needs_escaping: bool,
336        _missing_is_null: bool,
337        _time_unit: Option<TimeUnit>,
338    ) -> PolarsResult<()> {
339        let bytes = if needs_escaping {
340            &bytes[1..bytes.len() - 1]
341        } else {
342            bytes
343        };
344        if bytes.eq_ignore_ascii_case(b"false") {
345            self.append_value(false);
346        } else if bytes.eq_ignore_ascii_case(b"true") {
347            self.append_value(true);
348        } else if ignore_errors || bytes.is_empty() {
349            self.append_null();
350        } else {
351            polars_bail!(
352                ComputeError: "error while parsing value {} as boolean",
353                String::from_utf8_lossy(bytes),
354            );
355        }
356        Ok(())
357    }
358}
359
/// Buffer that collects the values of a date or datetime column while parsing CSV.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
pub struct DatetimeField<T: PolarsNumericType> {
    /// Parser for the pattern of the most recent successfully parsed value; `None` until a
    /// first value has been parsed.
    compiled: Option<DatetimeInfer<T>>,
    /// Builder that receives the parsed physical values.
    builder: PrimitiveChunkedBuilder<T>,
}
365
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T: PolarsNumericType> DatetimeField<T> {
    /// Creates an empty buffer with room for `capacity` values and no pattern compiled yet.
    fn new(name: PlSmallStr, capacity: usize) -> Self {
        Self {
            compiled: None,
            builder: PrimitiveChunkedBuilder::<T>::new(name, capacity),
        }
    }
}
376
/// Fallback date/datetime parser that works on validated text.
///
/// The pattern comes from the parser already stored in `buf`, or is inferred from this value
/// when none is stored. On success the parser built for that pattern is stored in `buf` and
/// the parsed value is appended.
///
/// Every failure (invalid UTF-8, no pattern found, parser construction failure, value not
/// matching the pattern) appends a null when `ignore_errors` is set and returns an error
/// otherwise.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    time_unit: Option<TimeUnit>,
    ignore_errors: bool,
) -> PolarsResult<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern>,
{
    let val = if bytes.is_ascii() {
        // SAFETY:
        // ascii bytes are always valid utf8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    } else if let Ok(text) = std::str::from_utf8(bytes) {
        text
    } else {
        polars_ensure!(ignore_errors, ComputeError: "invalid utf-8 sequence");
        buf.builder.append_null();
        return Ok(());
    };

    let pattern = if let Some(compiled) = &buf.compiled {
        compiled.pattern
    } else if let Some(inferred) = infer_pattern_single(val) {
        inferred
    } else {
        polars_ensure!(
            ignore_errors,
            ComputeError: "could not find a 'date/datetime' pattern for '{}'", val
        );
        buf.builder.append_null();
        return Ok(());
    };

    let mut infer = match DatetimeInfer::try_from_with_unit(pattern, time_unit) {
        Ok(infer) => infer,
        Err(err) => {
            if !ignore_errors {
                return Err(err);
            }
            buf.builder.append_null();
            return Ok(());
        },
    };

    let Some(parsed) = infer.parse(val) else {
        polars_ensure!(
            ignore_errors,
            ComputeError: "could not parse '{}' with pattern '{:?}'", val, pattern
        );
        buf.builder.append_null();
        return Ok(());
    };

    // Keep the working parser so that later values can take the fast path.
    buf.compiled = Some(infer);
    buf.builder.append_value(parsed);
    Ok(())
}
446
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuffer for DatetimeField<T>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
    /// Parses a date/datetime field and appends it to the builder.
    ///
    /// Quotes are stripped when `needs_escaping` is set and the field has at least two bytes.
    /// An empty field is appended as null. The stored parser is tried first on the raw bytes;
    /// if there is none, or it rejects the value, `slow_datetime_parser` takes over.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        let bytes = if needs_escaping && bytes.len() >= 2 {
            &bytes[1..bytes.len() - 1]
        } else {
            bytes
        };

        if bytes.is_empty() {
            // for types other than string `_missing_is_null` is irrelevant; we always append null
            self.builder.append_null();
            return Ok(());
        }

        // Fast path: reuse the already compiled parser directly on the bytes.
        let fast = self
            .compiled
            .as_mut()
            .and_then(|compiled| compiled.parse_bytes(bytes, time_unit));

        match fast {
            Some(parsed) => {
                self.builder.append_value(parsed);
                Ok(())
            },
            // fall back on the slower parser, which also has to do utf8 checking
            None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
        }
    }
}
489
490pub fn init_buffers(
491    projection: &[usize],
492    capacity: usize,
493    schema: &Schema,
494    quote_char: Option<u8>,
495    encoding: CsvEncoding,
496    decimal_comma: bool,
497) -> PolarsResult<Vec<Buffer>> {
498    projection
499        .iter()
500        .map(|&i| {
501            let (name, dtype) = schema.get_at_index(i).unwrap();
502            let name = name.clone();
503            let builder = match dtype {
504                &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)),
505                #[cfg(feature = "dtype-i8")]
506                &DataType::Int8 => Buffer::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
507                #[cfg(feature = "dtype-i16")]
508                &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
509                &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
510                &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
511                #[cfg(feature = "dtype-i128")]
512                &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
513                #[cfg(feature = "dtype-u8")]
514                &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
515                #[cfg(feature = "dtype-u16")]
516                &DataType::UInt16 => Buffer::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
517                &DataType::UInt32 => Buffer::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
518                &DataType::UInt64 => Buffer::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
519                &DataType::Float32 => {
520                    if decimal_comma {
521                        Buffer::DecimalFloat32(
522                            PrimitiveChunkedBuilder::new(name, capacity),
523                            Default::default(),
524                        )
525                    } else {
526                        Buffer::Float32(PrimitiveChunkedBuilder::new(name, capacity))
527                    }
528                },
529                &DataType::Float64 => {
530                    if decimal_comma {
531                        Buffer::DecimalFloat64(
532                            PrimitiveChunkedBuilder::new(name, capacity),
533                            Default::default(),
534                        )
535                    } else {
536                        Buffer::Float64(PrimitiveChunkedBuilder::new(name, capacity))
537                    }
538                },
539                &DataType::String => {
540                    Buffer::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))
541                },
542                #[cfg(feature = "dtype-datetime")]
543                DataType::Datetime(time_unit, time_zone) => Buffer::Datetime {
544                    buf: DatetimeField::new(name, capacity),
545                    time_unit: *time_unit,
546                    time_zone: time_zone.clone(),
547                },
548                #[cfg(feature = "dtype-date")]
549                &DataType::Date => Buffer::Date(DatetimeField::new(name, capacity)),
550                #[cfg(feature = "dtype-categorical")]
551                DataType::Categorical(_, _) | DataType::Enum(_, _) => {
552                    match dtype.cat_physical().unwrap() {
553                        CategoricalPhysical::U8 => {
554                            Buffer::Categorical8(CategoricalField::<Categorical8Type>::new(
555                                name,
556                                capacity,
557                                quote_char,
558                                dtype.clone(),
559                            ))
560                        },
561                        CategoricalPhysical::U16 => {
562                            Buffer::Categorical16(CategoricalField::<Categorical16Type>::new(
563                                name,
564                                capacity,
565                                quote_char,
566                                dtype.clone(),
567                            ))
568                        },
569                        CategoricalPhysical::U32 => {
570                            Buffer::Categorical32(CategoricalField::<Categorical32Type>::new(
571                                name,
572                                capacity,
573                                quote_char,
574                                dtype.clone(),
575                            ))
576                        },
577                    }
578                },
579                dt => polars_bail!(
580                    ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,
581                ),
582            };
583            Ok(builder)
584        })
585        .collect()
586}
587
588#[allow(clippy::large_enum_variant)]
589pub enum Buffer {
590    Boolean(BooleanChunkedBuilder),
591    #[cfg(feature = "dtype-i8")]
592    Int8(PrimitiveChunkedBuilder<Int8Type>),
593    #[cfg(feature = "dtype-i16")]
594    Int16(PrimitiveChunkedBuilder<Int16Type>),
595    Int32(PrimitiveChunkedBuilder<Int32Type>),
596    Int64(PrimitiveChunkedBuilder<Int64Type>),
597    #[cfg(feature = "dtype-i128")]
598    Int128(PrimitiveChunkedBuilder<Int128Type>),
599    #[cfg(feature = "dtype-u8")]
600    UInt8(PrimitiveChunkedBuilder<UInt8Type>),
601    #[cfg(feature = "dtype-u16")]
602    UInt16(PrimitiveChunkedBuilder<UInt16Type>),
603    UInt32(PrimitiveChunkedBuilder<UInt32Type>),
604    UInt64(PrimitiveChunkedBuilder<UInt64Type>),
605    Float32(PrimitiveChunkedBuilder<Float32Type>),
606    Float64(PrimitiveChunkedBuilder<Float64Type>),
607    /// Stores the Utf8 fields and the total string length seen for that column
608    Utf8(Utf8Field),
609    #[cfg(feature = "dtype-datetime")]
610    Datetime {
611        buf: DatetimeField<Int64Type>,
612        time_unit: TimeUnit,
613        time_zone: Option<TimeZone>,
614    },
615    #[cfg(feature = "dtype-date")]
616    Date(DatetimeField<Int32Type>),
617    #[cfg(feature = "dtype-categorical")]
618    Categorical8(CategoricalField<Categorical8Type>),
619    #[cfg(feature = "dtype-categorical")]
620    Categorical16(CategoricalField<Categorical16Type>),
621    #[cfg(feature = "dtype-categorical")]
622    Categorical32(CategoricalField<Categorical32Type>),
623    DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),
624    DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),
625}
626
627impl Buffer {
628    pub fn into_series(self) -> PolarsResult<Series> {
629        let s = match self {
630            Buffer::Boolean(v) => v.finish().into_series(),
631            #[cfg(feature = "dtype-i8")]
632            Buffer::Int8(v) => v.finish().into_series(),
633            #[cfg(feature = "dtype-i16")]
634            Buffer::Int16(v) => v.finish().into_series(),
635            Buffer::Int32(v) => v.finish().into_series(),
636            Buffer::Int64(v) => v.finish().into_series(),
637            #[cfg(feature = "dtype-i128")]
638            Buffer::Int128(v) => v.finish().into_series(),
639            #[cfg(feature = "dtype-u8")]
640            Buffer::UInt8(v) => v.finish().into_series(),
641            #[cfg(feature = "dtype-u16")]
642            Buffer::UInt16(v) => v.finish().into_series(),
643            Buffer::UInt32(v) => v.finish().into_series(),
644            Buffer::UInt64(v) => v.finish().into_series(),
645            Buffer::Float32(v) => v.finish().into_series(),
646            Buffer::Float64(v) => v.finish().into_series(),
647            Buffer::DecimalFloat32(v, _) => v.finish().into_series(),
648            Buffer::DecimalFloat64(v, _) => v.finish().into_series(),
649            #[cfg(feature = "dtype-datetime")]
650            Buffer::Datetime {
651                buf,
652                time_unit,
653                time_zone,
654            } => buf
655                .builder
656                .finish()
657                .into_series()
658                .cast(&DataType::Datetime(time_unit, time_zone))
659                .unwrap(),
660            #[cfg(feature = "dtype-date")]
661            Buffer::Date(v) => v
662                .builder
663                .finish()
664                .into_series()
665                .cast(&DataType::Date)
666                .unwrap(),
667
668            Buffer::Utf8(v) => {
669                let arr = v.mutable.freeze();
670                StringChunked::with_chunk(v.name, unsafe { arr.to_utf8view_unchecked() })
671                    .into_series()
672            },
673            #[cfg(feature = "dtype-categorical")]
674            Buffer::Categorical8(buf) => buf.builder.finish().into_series(),
675            #[cfg(feature = "dtype-categorical")]
676            Buffer::Categorical16(buf) => buf.builder.finish().into_series(),
677            #[cfg(feature = "dtype-categorical")]
678            Buffer::Categorical32(buf) => buf.builder.finish().into_series(),
679        };
680        Ok(s)
681    }
682
683    pub fn add_null(&mut self, valid: bool) {
684        match self {
685            Buffer::Boolean(v) => v.append_null(),
686            #[cfg(feature = "dtype-i8")]
687            Buffer::Int8(v) => v.append_null(),
688            #[cfg(feature = "dtype-i16")]
689            Buffer::Int16(v) => v.append_null(),
690            Buffer::Int32(v) => v.append_null(),
691            Buffer::Int64(v) => v.append_null(),
692            #[cfg(feature = "dtype-i128")]
693            Buffer::Int128(v) => v.append_null(),
694            #[cfg(feature = "dtype-u8")]
695            Buffer::UInt8(v) => v.append_null(),
696            #[cfg(feature = "dtype-u16")]
697            Buffer::UInt16(v) => v.append_null(),
698            Buffer::UInt32(v) => v.append_null(),
699            Buffer::UInt64(v) => v.append_null(),
700            Buffer::Float32(v) => v.append_null(),
701            Buffer::Float64(v) => v.append_null(),
702            Buffer::DecimalFloat32(v, _) => v.append_null(),
703            Buffer::DecimalFloat64(v, _) => v.append_null(),
704            Buffer::Utf8(v) => {
705                if valid {
706                    v.mutable.push_value("")
707                } else {
708                    v.mutable.push_null()
709                }
710            },
711            #[cfg(feature = "dtype-datetime")]
712            Buffer::Datetime { buf, .. } => buf.builder.append_null(),
713            #[cfg(feature = "dtype-date")]
714            Buffer::Date(v) => v.builder.append_null(),
715            #[cfg(feature = "dtype-categorical")]
716            Buffer::Categorical8(buf) => buf.builder.append_null(),
717            #[cfg(feature = "dtype-categorical")]
718            Buffer::Categorical16(buf) => buf.builder.append_null(),
719            #[cfg(feature = "dtype-categorical")]
720            Buffer::Categorical32(buf) => buf.builder.append_null(),
721        };
722    }
723
724    pub fn dtype(&self) -> DataType {
725        match self {
726            Buffer::Boolean(_) => DataType::Boolean,
727            #[cfg(feature = "dtype-i8")]
728            Buffer::Int8(_) => DataType::Int8,
729            #[cfg(feature = "dtype-i16")]
730            Buffer::Int16(_) => DataType::Int16,
731            Buffer::Int32(_) => DataType::Int32,
732            Buffer::Int64(_) => DataType::Int64,
733            #[cfg(feature = "dtype-i128")]
734            Buffer::Int128(_) => DataType::Int128,
735            #[cfg(feature = "dtype-u8")]
736            Buffer::UInt8(_) => DataType::UInt8,
737            #[cfg(feature = "dtype-u16")]
738            Buffer::UInt16(_) => DataType::UInt16,
739            Buffer::UInt32(_) => DataType::UInt32,
740            Buffer::UInt64(_) => DataType::UInt64,
741            Buffer::Float32(_) | Buffer::DecimalFloat32(_, _) => DataType::Float32,
742            Buffer::Float64(_) | Buffer::DecimalFloat64(_, _) => DataType::Float64,
743            Buffer::Utf8(_) => DataType::String,
744            #[cfg(feature = "dtype-datetime")]
745            Buffer::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),
746            #[cfg(feature = "dtype-date")]
747            Buffer::Date(_) => DataType::Date,
748            #[cfg(feature = "dtype-categorical")]
749            Buffer::Categorical8(buf) => buf.builder.dtype().clone(),
750            #[cfg(feature = "dtype-categorical")]
751            Buffer::Categorical16(buf) => buf.builder.dtype().clone(),
752            #[cfg(feature = "dtype-categorical")]
753            Buffer::Categorical32(buf) => buf.builder.dtype().clone(),
754        }
755    }
756
757    #[inline]
758    pub fn add(
759        &mut self,
760        bytes: &[u8],
761        ignore_errors: bool,
762        needs_escaping: bool,
763        missing_is_null: bool,
764    ) -> PolarsResult<()> {
765        use Buffer::*;
766        match self {
767            Boolean(buf) => <BooleanChunkedBuilder as ParsedBuffer>::parse_bytes(
768                buf,
769                bytes,
770                ignore_errors,
771                needs_escaping,
772                missing_is_null,
773                None,
774            ),
775            #[cfg(feature = "dtype-i8")]
776            Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuffer>::parse_bytes(
777                buf,
778                bytes,
779                ignore_errors,
780                needs_escaping,
781                missing_is_null,
782                None,
783            ),
784            #[cfg(feature = "dtype-i16")]
785            Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuffer>::parse_bytes(
786                buf,
787                bytes,
788                ignore_errors,
789                needs_escaping,
790                missing_is_null,
791                None,
792            ),
793            Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
794                buf,
795                bytes,
796                ignore_errors,
797                needs_escaping,
798                missing_is_null,
799                None,
800            ),
801            Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuffer>::parse_bytes(
802                buf,
803                bytes,
804                ignore_errors,
805                needs_escaping,
806                missing_is_null,
807                None,
808            ),
809            #[cfg(feature = "dtype-i128")]
810            Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
811                buf,
812                bytes,
813                ignore_errors,
814                needs_escaping,
815                missing_is_null,
816                None,
817            ),
818            #[cfg(feature = "dtype-u8")]
819            UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
820                buf,
821                bytes,
822                ignore_errors,
823                needs_escaping,
824                missing_is_null,
825                None,
826            ),
827            #[cfg(feature = "dtype-u16")]
828            UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuffer>::parse_bytes(
829                buf,
830                bytes,
831                ignore_errors,
832                needs_escaping,
833                missing_is_null,
834                None,
835            ),
836            UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuffer>::parse_bytes(
837                buf,
838                bytes,
839                ignore_errors,
840                needs_escaping,
841                missing_is_null,
842                None,
843            ),
844            UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
845                buf,
846                bytes,
847                ignore_errors,
848                needs_escaping,
849                missing_is_null,
850                None,
851            ),
852            Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
853                buf,
854                bytes,
855                ignore_errors,
856                needs_escaping,
857                missing_is_null,
858                None,
859            ),
860            Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
861                buf,
862                bytes,
863                ignore_errors,
864                needs_escaping,
865                missing_is_null,
866                None,
867            ),
868            DecimalFloat32(buf, scratch) => {
869                prepare_decimal_comma(bytes, scratch);
870                <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
871                    buf,
872                    scratch,
873                    ignore_errors,
874                    needs_escaping,
875                    missing_is_null,
876                    None,
877                )
878            },
879            DecimalFloat64(buf, scratch) => {
880                prepare_decimal_comma(bytes, scratch);
881                <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
882                    buf,
883                    scratch,
884                    ignore_errors,
885                    needs_escaping,
886                    missing_is_null,
887                    None,
888                )
889            },
890            Utf8(buf) => <Utf8Field as ParsedBuffer>::parse_bytes(
891                buf,
892                bytes,
893                ignore_errors,
894                needs_escaping,
895                missing_is_null,
896                None,
897            ),
898            #[cfg(feature = "dtype-datetime")]
899            Datetime { buf, time_unit, .. } => {
900                <DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
901                    buf,
902                    bytes,
903                    ignore_errors,
904                    needs_escaping,
905                    missing_is_null,
906                    Some(*time_unit),
907                )
908            },
909            #[cfg(feature = "dtype-date")]
910            Date(buf) => <DatetimeField<Int32Type> as ParsedBuffer>::parse_bytes(
911                buf,
912                bytes,
913                ignore_errors,
914                needs_escaping,
915                missing_is_null,
916                None,
917            ),
918            #[cfg(feature = "dtype-categorical")]
919            Categorical8(buf) => {
920                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
921            },
922            #[cfg(feature = "dtype-categorical")]
923            Categorical16(buf) => {
924                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
925            },
926            #[cfg(feature = "dtype-categorical")]
927            Categorical32(buf) => {
928                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
929            },
930        }
931    }
932}
933
934#[inline]
935fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {
936    scratch.clear();
937    scratch.reserve(bytes.len());
938
939    // SAFETY: we pre-allocated.
940    for &byte in bytes {
941        if byte == b',' {
942            unsafe { scratch.push_unchecked(b'.') }
943        } else {
944            unsafe { scratch.push_unchecked(byte) }
945        }
946    }
947}