polars_io/csv/read/
buffer.rs

1use arrow::array::MutableBinaryViewArray;
2#[cfg(feature = "dtype-decimal")]
3use polars_compute::decimal::str_to_dec128;
4#[cfg(feature = "dtype-categorical")]
5use polars_core::chunked_array::builder::CategoricalChunkedBuilder;
6use polars_core::prelude::*;
7use polars_error::to_compute_err;
8#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
9use polars_time::chunkedarray::string::Pattern;
10#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
11use polars_time::prelude::string::infer::{
12    DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,
13};
14#[cfg(feature = "dtype-f16")]
15use polars_utils::float16::pf16;
16use polars_utils::vec::PushUnchecked;
17
18use super::options::CsvEncoding;
19use super::parser::{could_be_whitespace_fast, skip_whitespace};
20use super::utils::escape_field;
21
/// Parsing of a single numeric CSV field straight from its raw bytes.
///
/// Implemented in this file for the float and integer Polars types.
pub(crate) trait PrimitiveParser: PolarsNumericType {
    /// Parses `bytes` into the native value of the type.
    ///
    /// The implementations in this file return `None` when the underlying
    /// parser rejects the input.
    fn parse(bytes: &[u8]) -> Option<Self::Native>;
}
25
#[cfg(feature = "dtype-f16")]
impl PrimitiveParser for Float16Type {
    /// Parses the field as an `f32` first and then converts it to `pf16`.
    ///
    /// Returns `None` when the float parser fails or when the conversion
    /// itself yields `None`.
    #[inline]
    fn parse(bytes: &[u8]) -> Option<pf16> {
        use num_traits::FromPrimitive;

        let wide: f32 = fast_float2::parse(bytes).ok()?;
        pf16::from_f32(wide)
    }
}
35
36impl PrimitiveParser for Float32Type {
37    #[inline]
38    fn parse(bytes: &[u8]) -> Option<f32> {
39        fast_float2::parse(bytes).ok()
40    }
41}
42impl PrimitiveParser for Float64Type {
43    #[inline]
44    fn parse(bytes: &[u8]) -> Option<f64> {
45        fast_float2::parse(bytes).ok()
46    }
47}
48
49#[cfg(feature = "dtype-u8")]
50impl PrimitiveParser for UInt8Type {
51    #[inline]
52    fn parse(bytes: &[u8]) -> Option<u8> {
53        atoi_simd::parse_skipped(bytes).ok()
54    }
55}
56#[cfg(feature = "dtype-u16")]
57impl PrimitiveParser for UInt16Type {
58    #[inline]
59    fn parse(bytes: &[u8]) -> Option<u16> {
60        atoi_simd::parse_skipped(bytes).ok()
61    }
62}
63impl PrimitiveParser for UInt32Type {
64    #[inline]
65    fn parse(bytes: &[u8]) -> Option<u32> {
66        atoi_simd::parse_skipped(bytes).ok()
67    }
68}
69impl PrimitiveParser for UInt64Type {
70    #[inline]
71    fn parse(bytes: &[u8]) -> Option<u64> {
72        atoi_simd::parse_skipped(bytes).ok()
73    }
74}
75#[cfg(feature = "dtype-u128")]
76impl PrimitiveParser for UInt128Type {
77    #[inline]
78    fn parse(bytes: &[u8]) -> Option<u128> {
79        atoi_simd::parse_skipped(bytes).ok()
80    }
81}
82#[cfg(feature = "dtype-i8")]
83impl PrimitiveParser for Int8Type {
84    #[inline]
85    fn parse(bytes: &[u8]) -> Option<i8> {
86        atoi_simd::parse_skipped(bytes).ok()
87    }
88}
89#[cfg(feature = "dtype-i16")]
90impl PrimitiveParser for Int16Type {
91    #[inline]
92    fn parse(bytes: &[u8]) -> Option<i16> {
93        atoi_simd::parse_skipped(bytes).ok()
94    }
95}
96impl PrimitiveParser for Int32Type {
97    #[inline]
98    fn parse(bytes: &[u8]) -> Option<i32> {
99        atoi_simd::parse_skipped(bytes).ok()
100    }
101}
102impl PrimitiveParser for Int64Type {
103    #[inline]
104    fn parse(bytes: &[u8]) -> Option<i64> {
105        atoi_simd::parse_skipped(bytes).ok()
106    }
107}
108#[cfg(feature = "dtype-i128")]
109impl PrimitiveParser for Int128Type {
110    #[inline]
111    fn parse(bytes: &[u8]) -> Option<i128> {
112        atoi_simd::parse_skipped(bytes).ok()
113    }
114}
115
/// A column buffer that consumes one raw CSV field at a time.
trait ParsedBuffer {
    /// Parses `bytes` (a single field) and appends the outcome to the buffer.
    ///
    /// * `ignore_errors` - the implementations in this file append a null
    ///   instead of returning an error when the field cannot be parsed.
    /// * `_needs_escaping` - the field is quoted; implementations strip the
    ///   surrounding quote bytes or unescape the field before parsing.
    /// * `_missing_is_null` - only consulted by the string buffer, where it
    ///   decides whether an empty field becomes a null or an empty string.
    /// * `_time_unit` - forwarded to the date/datetime parsing; unused by the
    ///   other buffers.
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        _needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()>;
}
126
127impl<T> ParsedBuffer for PrimitiveChunkedBuilder<T>
128where
129    T: PolarsNumericType + PrimitiveParser,
130{
131    #[inline]
132    fn parse_bytes(
133        &mut self,
134        mut bytes: &[u8],
135        ignore_errors: bool,
136        needs_escaping: bool,
137        _missing_is_null: bool,
138        _time_unit: Option<TimeUnit>,
139    ) -> PolarsResult<()> {
140        if !bytes.is_empty() && needs_escaping {
141            bytes = &bytes[1..bytes.len() - 1];
142        }
143
144        if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {
145            bytes = skip_whitespace(bytes);
146        }
147
148        if bytes.is_empty() {
149            self.append_null();
150            return Ok(());
151        }
152
153        match T::parse(bytes) {
154            Some(value) => self.append_value(value),
155            None => {
156                if ignore_errors {
157                    self.append_null()
158                } else {
159                    polars_bail!(ComputeError: "invalid primitive value found during CSV parsing")
160                }
161            },
162        }
163        Ok(())
164    }
165}
166
167pub struct Utf8Field {
168    name: PlSmallStr,
169    mutable: MutableBinaryViewArray<[u8]>,
170    scratch: Vec<u8>,
171    quote_char: u8,
172    encoding: CsvEncoding,
173}
174
175impl Utf8Field {
176    fn new(
177        name: PlSmallStr,
178        capacity: usize,
179        quote_char: Option<u8>,
180        encoding: CsvEncoding,
181    ) -> Self {
182        Self {
183            name,
184            mutable: MutableBinaryViewArray::with_capacity(capacity),
185            scratch: vec![],
186            quote_char: quote_char.unwrap_or(b'"'),
187            encoding,
188        }
189    }
190}
191
192#[inline]
193pub fn validate_utf8(bytes: &[u8]) -> bool {
194    simdutf8::basic::from_utf8(bytes).is_ok()
195}
196
197impl ParsedBuffer for Utf8Field {
198    #[inline]
199    fn parse_bytes(
200        &mut self,
201        bytes: &[u8],
202        ignore_errors: bool,
203        needs_escaping: bool,
204        missing_is_null: bool,
205        _time_unit: Option<TimeUnit>,
206    ) -> PolarsResult<()> {
207        if bytes.is_empty() {
208            if missing_is_null {
209                self.mutable.push_null()
210            } else {
211                self.mutable.push(Some([]))
212            }
213            return Ok(());
214        }
215
216        // note that one branch writes without updating the length, so we must do that later.
217        let escaped_bytes = if needs_escaping {
218            self.scratch.clear();
219            self.scratch.reserve(bytes.len());
220            polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
221
222            // SAFETY:
223            // we just allocated enough capacity and data_len is correct.
224            unsafe {
225                let n_written =
226                    escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());
227                self.scratch.set_len(n_written);
228            }
229
230            self.scratch.as_slice()
231        } else {
232            bytes
233        };
234
235        if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {
236            // It is important that this happens after escaping, as invalid escaped string can produce
237            // invalid utf8.
238            let parse_result = validate_utf8(escaped_bytes);
239
240            match parse_result {
241                true => {
242                    let value = escaped_bytes;
243                    self.mutable.push_value(value)
244                },
245                false => {
246                    if matches!(self.encoding, CsvEncoding::LossyUtf8) {
247                        // TODO! do this without allocating
248                        let s = String::from_utf8_lossy(escaped_bytes);
249                        self.mutable.push_value(s.as_ref().as_bytes())
250                    } else if ignore_errors {
251                        self.mutable.push_null()
252                    } else {
253                        // If field before escaping is valid utf8, the escaping is incorrect.
254                        if needs_escaping && validate_utf8(bytes) {
255                            polars_bail!(ComputeError: "string field is not properly escaped");
256                        } else {
257                            polars_bail!(ComputeError: "invalid utf-8 sequence");
258                        }
259                    }
260                },
261            }
262        } else {
263            self.mutable.push_value(escaped_bytes)
264        }
265
266        Ok(())
267    }
268}
269
/// Buffer that collects the values of a categorical or enum column.
#[cfg(feature = "dtype-categorical")]
pub struct CategoricalField<T: PolarsCategoricalType> {
    /// Reusable scratch space that receives the unescaped form of quoted fields.
    escape_scratch: Vec<u8>,
    /// Quote byte of the CSV dialect.
    quote_char: u8,
    /// Builder the parsed string keys are appended to.
    builder: CategoricalChunkedBuilder<T>,
}

#[cfg(feature = "dtype-categorical")]
impl<T: PolarsCategoricalType> CategoricalField<T> {
    /// Creates a buffer for `dtype` with room reserved for `capacity` values.
    ///
    /// A missing `quote_char` falls back to `"`.
    fn new(name: PlSmallStr, capacity: usize, quote_char: Option<u8>, dtype: DataType) -> Self {
        let mut builder = CategoricalChunkedBuilder::new(name, dtype);
        builder.reserve(capacity);

        Self {
            escape_scratch: vec![],
            quote_char: quote_char.unwrap_or(b'"'),
            builder,
        }
    }

    /// Appends one field to the builder.
    ///
    /// An empty field becomes a null. A field that is not valid UTF-8, either
    /// as read or after unescaping, becomes a null when `ignore_errors` is set
    /// and a `ComputeError` otherwise. Errors returned by the builder's
    /// `append_str` are propagated.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        if !validate_utf8(bytes) {
            if ignore_errors {
                self.builder.append_null();
                return Ok(());
            }
            polars_bail!(ComputeError: "invalid utf-8 sequence");
        }

        let key_bytes: &[u8] = if needs_escaping {
            polars_ensure!(
                bytes.len() > 1,
                ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.",
                std::str::from_utf8(bytes).map_err(to_compute_err)?
            );
            self.escape_scratch.clear();
            self.escape_scratch.reserve(bytes.len());
            // SAFETY:
            // the scratch buffer is empty and has spare capacity for at least
            // `bytes.len()` bytes; this assumes `escape_field` writes no more
            // than that and returns the number of bytes it initialized.
            unsafe {
                let n_written = escape_field(
                    bytes,
                    self.quote_char,
                    self.escape_scratch.spare_capacity_mut(),
                );
                self.escape_scratch.set_len(n_written);
            }

            // Escaping an improperly quoted field can produce invalid UTF-8 even
            // though the raw field was valid, so the check on `bytes` above does
            // not cover the scratch buffer. Validate again before the unchecked
            // conversion below.
            if !validate_utf8(&self.escape_scratch) {
                if ignore_errors {
                    self.builder.append_null();
                    return Ok(());
                }
                polars_bail!(ComputeError: "string field is not properly escaped");
            }

            &self.escape_scratch
        } else {
            bytes
        };

        // SAFETY:
        // `key_bytes` is either the raw field or the unescaped scratch buffer,
        // and each of them passed `validate_utf8` above.
        let key = unsafe { std::str::from_utf8_unchecked(key_bytes) };
        self.builder.append_str(key)?;
        Ok(())
    }
}
337
338impl ParsedBuffer for BooleanChunkedBuilder {
339    #[inline]
340    fn parse_bytes(
341        &mut self,
342        bytes: &[u8],
343        ignore_errors: bool,
344        needs_escaping: bool,
345        _missing_is_null: bool,
346        _time_unit: Option<TimeUnit>,
347    ) -> PolarsResult<()> {
348        let bytes = if needs_escaping {
349            &bytes[1..bytes.len() - 1]
350        } else {
351            bytes
352        };
353        if bytes.eq_ignore_ascii_case(b"false") {
354            self.append_value(false);
355        } else if bytes.eq_ignore_ascii_case(b"true") {
356            self.append_value(true);
357        } else if ignore_errors || bytes.is_empty() {
358            self.append_null();
359        } else {
360            polars_bail!(
361                ComputeError: "error while parsing value {} as boolean",
362                String::from_utf8_lossy(bytes),
363            );
364        }
365        Ok(())
366    }
367}
368
/// Buffer that collects the values of a decimal column as `i128`.
#[cfg(feature = "dtype-decimal")]
pub struct DecimalField {
    /// Builder holding the parsed values.
    builder: PrimitiveChunkedBuilder<Int128Type>,
    /// Precision handed to the decimal parser and to the resulting dtype.
    precision: usize,
    /// Scale handed to the decimal parser and to the resulting dtype.
    scale: usize,
    /// Passed through to the decimal parser.
    decimal_comma: bool,
}

#[cfg(feature = "dtype-decimal")]
impl DecimalField {
    /// Creates an empty decimal buffer with room for `capacity` values.
    fn new(
        name: PlSmallStr,
        capacity: usize,
        precision: usize,
        scale: usize,
        decimal_comma: bool,
    ) -> Self {
        Self {
            builder: PrimitiveChunkedBuilder::<Int128Type>::new(name, capacity),
            precision,
            scale,
            decimal_comma,
        }
    }
}
395
#[cfg(feature = "dtype-decimal")]
impl ParsedBuffer for DecimalField {
    /// Parses one decimal field and appends it to the builder.
    ///
    /// Quoted fields have their first and last byte removed, leading
    /// whitespace is skipped, and a field that is empty afterwards becomes a
    /// null. A field that fails to parse becomes a null when `ignore_errors`
    /// is set and a `ComputeError` otherwise.
    #[inline]
    fn parse_bytes(
        &mut self,
        mut bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        // Only strip when both quote bytes can be present: a 1-byte field would
        // otherwise be sliced as `1..0` and panic. Such a field is left as-is
        // and rejected by the parser below.
        if needs_escaping && bytes.len() >= 2 {
            bytes = &bytes[1..bytes.len() - 1];
        }

        if !bytes.is_empty() && could_be_whitespace_fast(bytes[0]) {
            bytes = skip_whitespace(bytes);
        }

        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        match str_to_dec128(bytes, self.precision, self.scale, self.decimal_comma) {
            Some(value) => self.builder.append_value(value),
            None => {
                if ignore_errors {
                    self.builder.append_null()
                } else {
                    polars_bail!(ComputeError: "invalid decimal value found during CSV parsing")
                }
            },
        }

        Ok(())
    }
}
434
/// Buffer that collects the physical values of a date or datetime column.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
pub struct DatetimeField<T: PolarsNumericType> {
    /// Parser state of the last successful slow-path parse; `None` until one
    /// has succeeded.
    compiled: Option<DatetimeInfer<T>>,
    /// Builder holding the parsed values.
    builder: PrimitiveChunkedBuilder<T>,
}

#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T: PolarsNumericType> DatetimeField<T> {
    /// Creates an empty buffer with room for `capacity` values and no parser
    /// state yet.
    fn new(name: PlSmallStr, capacity: usize) -> Self {
        Self {
            compiled: None,
            builder: PrimitiveChunkedBuilder::<T>::new(name, capacity),
        }
    }
}
451
/// Slow path for date/datetime fields: checks the field is UTF-8, picks a
/// pattern, builds a fresh `DatetimeInfer` and parses the field with it.
///
/// The pattern is taken from `buf.compiled` when present and inferred from the
/// field otherwise. On success the new parser state is stored in
/// `buf.compiled` and the value is appended. Every failure appends a null when
/// `ignore_errors` is set and returns an error otherwise.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    time_unit: Option<TimeUnit>,
    ignore_errors: bool,
) -> PolarsResult<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern>,
{
    let val = if bytes.is_ascii() {
        // SAFETY:
        // pure ASCII is always valid UTF-8
        unsafe { std::str::from_utf8_unchecked(bytes) }
    } else if let Ok(val) = std::str::from_utf8(bytes) {
        val
    } else if ignore_errors {
        buf.builder.append_null();
        return Ok(());
    } else {
        polars_bail!(ComputeError: "invalid utf-8 sequence");
    };

    // Reuse the pattern of the stored parser state; only infer one when there
    // is no state yet.
    let known_pattern = buf.compiled.as_ref().map(|compiled| compiled.pattern);
    let pattern = match known_pattern.or_else(|| infer_pattern_single(val)) {
        Some(pattern) => pattern,
        None if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        None => {
            polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for '{}'", val)
        },
    };

    let mut infer = match DatetimeInfer::try_from_with_unit(pattern, time_unit) {
        Ok(infer) => infer,
        Err(_) if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        Err(err) => return Err(err),
    };

    match infer.parse(val) {
        Some(parsed) => {
            buf.compiled = Some(infer);
            buf.builder.append_value(parsed);
            Ok(())
        },
        None if ignore_errors => {
            buf.builder.append_null();
            Ok(())
        },
        None => {
            polars_bail!(ComputeError: "could not parse '{}' with pattern '{:?}'", val, pattern)
        },
    }
}
521
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuffer for DatetimeField<T>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
    /// Parses one date/datetime field and appends it to the builder.
    ///
    /// A quoted field of at least two bytes loses its first and last byte; an
    /// empty field becomes a null. The stored parser state is tried first on
    /// the raw bytes, and `slow_datetime_parser` handles everything it cannot
    /// parse as well as the case where no state exists yet.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        let bytes = match bytes {
            [_, inner @ .., _] if needs_escaping => inner,
            _ => bytes,
        };

        if bytes.is_empty() {
            // for types other than string `_missing_is_null` is irrelevant; we always append null
            self.builder.append_null();
            return Ok(());
        }

        let fast_result = self
            .compiled
            .as_mut()
            .and_then(|compiled| compiled.parse_bytes(bytes, time_unit));

        match fast_result {
            Some(parsed) => {
                self.builder.append_value(parsed);
                Ok(())
            },
            // No parser state yet, or it could not parse the bytes: fall back
            // on the slower parser, which also does the utf8 checking.
            None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
        }
    }
}
564
565pub fn init_buffers(
566    projection: &[usize],
567    capacity: usize,
568    schema: &Schema,
569    quote_char: Option<u8>,
570    encoding: CsvEncoding,
571    decimal_comma: bool,
572) -> PolarsResult<Vec<Buffer>> {
573    projection
574        .iter()
575        .map(|&i| {
576            let (name, dtype) = schema.get_at_index(i).unwrap();
577            let name = name.clone();
578            let builder = match dtype {
579                &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)),
580                #[cfg(feature = "dtype-i8")]
581                &DataType::Int8 => Buffer::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
582                #[cfg(feature = "dtype-i16")]
583                &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
584                &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
585                &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
586                #[cfg(feature = "dtype-i128")]
587                &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
588                #[cfg(feature = "dtype-u8")]
589                &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
590                #[cfg(feature = "dtype-u16")]
591                &DataType::UInt16 => Buffer::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
592                &DataType::UInt32 => Buffer::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
593                &DataType::UInt64 => Buffer::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
594                #[cfg(feature = "dtype-u128")]
595                &DataType::UInt128 => Buffer::UInt128(PrimitiveChunkedBuilder::new(name, capacity)),
596                #[cfg(feature = "dtype-f16")]
597                &DataType::Float16 => {
598                    if decimal_comma {
599                        Buffer::DecimalFloat16(
600                            PrimitiveChunkedBuilder::new(name, capacity),
601                            Default::default(),
602                        )
603                    } else {
604                        Buffer::Float16(PrimitiveChunkedBuilder::new(name, capacity))
605                    }
606                },
607                &DataType::Float32 => {
608                    if decimal_comma {
609                        Buffer::DecimalFloat32(
610                            PrimitiveChunkedBuilder::new(name, capacity),
611                            Default::default(),
612                        )
613                    } else {
614                        Buffer::Float32(PrimitiveChunkedBuilder::new(name, capacity))
615                    }
616                },
617                &DataType::Float64 => {
618                    if decimal_comma {
619                        Buffer::DecimalFloat64(
620                            PrimitiveChunkedBuilder::new(name, capacity),
621                            Default::default(),
622                        )
623                    } else {
624                        Buffer::Float64(PrimitiveChunkedBuilder::new(name, capacity))
625                    }
626                },
627                #[cfg(feature = "dtype-decimal")]
628                &DataType::Decimal(precision, scale) => Buffer::Decimal(DecimalField::new(
629                    name,
630                    capacity,
631                    precision,
632                    scale,
633                    decimal_comma,
634                )),
635                &DataType::String => {
636                    Buffer::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))
637                },
638                #[cfg(feature = "dtype-datetime")]
639                DataType::Datetime(time_unit, time_zone) => Buffer::Datetime {
640                    buf: DatetimeField::new(name, capacity),
641                    time_unit: *time_unit,
642                    time_zone: time_zone.clone(),
643                },
644                #[cfg(feature = "dtype-date")]
645                &DataType::Date => Buffer::Date(DatetimeField::new(name, capacity)),
646                #[cfg(feature = "dtype-categorical")]
647                DataType::Categorical(_, _) | DataType::Enum(_, _) => {
648                    match dtype.cat_physical().unwrap() {
649                        CategoricalPhysical::U8 => {
650                            Buffer::Categorical8(CategoricalField::<Categorical8Type>::new(
651                                name,
652                                capacity,
653                                quote_char,
654                                dtype.clone(),
655                            ))
656                        },
657                        CategoricalPhysical::U16 => {
658                            Buffer::Categorical16(CategoricalField::<Categorical16Type>::new(
659                                name,
660                                capacity,
661                                quote_char,
662                                dtype.clone(),
663                            ))
664                        },
665                        CategoricalPhysical::U32 => {
666                            Buffer::Categorical32(CategoricalField::<Categorical32Type>::new(
667                                name,
668                                capacity,
669                                quote_char,
670                                dtype.clone(),
671                            ))
672                        },
673                    }
674                },
675                dt => polars_bail!(
676                    ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,
677                ),
678            };
679            Ok(builder)
680        })
681        .collect()
682}
683
/// Per-column buffer that the parsed CSV fields are appended to.
///
/// There is one variant per supported data type; `init_buffers` picks the
/// variant from the schema.
#[allow(clippy::large_enum_variant)]
pub enum Buffer {
    Boolean(BooleanChunkedBuilder),
    #[cfg(feature = "dtype-i8")]
    Int8(PrimitiveChunkedBuilder<Int8Type>),
    #[cfg(feature = "dtype-i16")]
    Int16(PrimitiveChunkedBuilder<Int16Type>),
    Int32(PrimitiveChunkedBuilder<Int32Type>),
    Int64(PrimitiveChunkedBuilder<Int64Type>),
    #[cfg(feature = "dtype-i128")]
    Int128(PrimitiveChunkedBuilder<Int128Type>),
    #[cfg(feature = "dtype-u8")]
    UInt8(PrimitiveChunkedBuilder<UInt8Type>),
    #[cfg(feature = "dtype-u16")]
    UInt16(PrimitiveChunkedBuilder<UInt16Type>),
    UInt32(PrimitiveChunkedBuilder<UInt32Type>),
    UInt64(PrimitiveChunkedBuilder<UInt64Type>),
    #[cfg(feature = "dtype-u128")]
    UInt128(PrimitiveChunkedBuilder<UInt128Type>),
    #[cfg(feature = "dtype-f16")]
    Float16(PrimitiveChunkedBuilder<Float16Type>),
    Float32(PrimitiveChunkedBuilder<Float32Type>),
    Float64(PrimitiveChunkedBuilder<Float64Type>),
    /// Decimal values together with their precision and scale.
    #[cfg(feature = "dtype-decimal")]
    Decimal(DecimalField),
    /// String values, kept as raw bytes together with the scratch buffer and
    /// the quote/encoding settings needed to unescape and validate them.
    Utf8(Utf8Field),
    /// Datetime values plus the time unit and time zone of the target dtype.
    #[cfg(feature = "dtype-datetime")]
    Datetime {
        buf: DatetimeField<Int64Type>,
        time_unit: TimeUnit,
        time_zone: Option<TimeZone>,
    },
    #[cfg(feature = "dtype-date")]
    Date(DatetimeField<Int32Type>),
    #[cfg(feature = "dtype-categorical")]
    Categorical8(CategoricalField<Categorical8Type>),
    #[cfg(feature = "dtype-categorical")]
    Categorical16(CategoricalField<Categorical16Type>),
    #[cfg(feature = "dtype-categorical")]
    Categorical32(CategoricalField<Categorical32Type>),
    // The `DecimalFloat*` variants are used for float columns when
    // `decimal_comma` is set. NOTE(review): the `Vec<u8>` is presumably scratch
    // space for rewriting the field before parsing; its use is not visible here.
    #[cfg(feature = "dtype-f16")]
    DecimalFloat16(PrimitiveChunkedBuilder<Float16Type>, Vec<u8>),
    DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),
    DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),
}
730
731impl Buffer {
732    pub fn into_series(self) -> PolarsResult<Series> {
733        let s = match self {
734            Buffer::Boolean(v) => v.finish().into_series(),
735            #[cfg(feature = "dtype-i8")]
736            Buffer::Int8(v) => v.finish().into_series(),
737            #[cfg(feature = "dtype-i16")]
738            Buffer::Int16(v) => v.finish().into_series(),
739            Buffer::Int32(v) => v.finish().into_series(),
740            Buffer::Int64(v) => v.finish().into_series(),
741            #[cfg(feature = "dtype-i128")]
742            Buffer::Int128(v) => v.finish().into_series(),
743            #[cfg(feature = "dtype-u8")]
744            Buffer::UInt8(v) => v.finish().into_series(),
745            #[cfg(feature = "dtype-u16")]
746            Buffer::UInt16(v) => v.finish().into_series(),
747            Buffer::UInt32(v) => v.finish().into_series(),
748            Buffer::UInt64(v) => v.finish().into_series(),
749            #[cfg(feature = "dtype-u128")]
750            Buffer::UInt128(v) => v.finish().into_series(),
751            #[cfg(feature = "dtype-f16")]
752            Buffer::Float16(v) => v.finish().into_series(),
753            Buffer::Float32(v) => v.finish().into_series(),
754            Buffer::Float64(v) => v.finish().into_series(),
755            #[cfg(feature = "dtype-f16")]
756            Buffer::DecimalFloat16(v, _) => v.finish().into_series(),
757            Buffer::DecimalFloat32(v, _) => v.finish().into_series(),
758            Buffer::DecimalFloat64(v, _) => v.finish().into_series(),
759            #[cfg(feature = "dtype-decimal")]
760            Buffer::Decimal(DecimalField {
761                builder,
762                precision,
763                scale,
764                ..
765            }) => unsafe {
766                builder
767                    .finish()
768                    .into_series()
769                    .from_physical_unchecked(&DataType::Decimal(precision, scale))
770                    .unwrap()
771            },
772            #[cfg(feature = "dtype-datetime")]
773            Buffer::Datetime {
774                buf,
775                time_unit,
776                time_zone,
777            } => buf
778                .builder
779                .finish()
780                .into_series()
781                .cast(&DataType::Datetime(time_unit, time_zone))
782                .unwrap(),
783            #[cfg(feature = "dtype-date")]
784            Buffer::Date(v) => v
785                .builder
786                .finish()
787                .into_series()
788                .cast(&DataType::Date)
789                .unwrap(),
790
791            Buffer::Utf8(v) => {
792                let arr = v.mutable.freeze();
793                StringChunked::with_chunk(v.name, unsafe { arr.to_utf8view_unchecked() })
794                    .into_series()
795            },
796            #[cfg(feature = "dtype-categorical")]
797            Buffer::Categorical8(buf) => buf.builder.finish().into_series(),
798            #[cfg(feature = "dtype-categorical")]
799            Buffer::Categorical16(buf) => buf.builder.finish().into_series(),
800            #[cfg(feature = "dtype-categorical")]
801            Buffer::Categorical32(buf) => buf.builder.finish().into_series(),
802        };
803        Ok(s)
804    }
805
806    pub fn add_null(&mut self, valid: bool) {
807        match self {
808            Buffer::Boolean(v) => v.append_null(),
809            #[cfg(feature = "dtype-i8")]
810            Buffer::Int8(v) => v.append_null(),
811            #[cfg(feature = "dtype-i16")]
812            Buffer::Int16(v) => v.append_null(),
813            Buffer::Int32(v) => v.append_null(),
814            Buffer::Int64(v) => v.append_null(),
815            #[cfg(feature = "dtype-i128")]
816            Buffer::Int128(v) => v.append_null(),
817            #[cfg(feature = "dtype-u8")]
818            Buffer::UInt8(v) => v.append_null(),
819            #[cfg(feature = "dtype-u16")]
820            Buffer::UInt16(v) => v.append_null(),
821            Buffer::UInt32(v) => v.append_null(),
822            Buffer::UInt64(v) => v.append_null(),
823            #[cfg(feature = "dtype-u128")]
824            Buffer::UInt128(v) => v.append_null(),
825            #[cfg(feature = "dtype-f16")]
826            Buffer::Float16(v) => v.append_null(),
827            Buffer::Float32(v) => v.append_null(),
828            Buffer::Float64(v) => v.append_null(),
829            #[cfg(feature = "dtype-decimal")]
830            Buffer::Decimal(buf) => buf.builder.append_null(),
831            #[cfg(feature = "dtype-f16")]
832            Buffer::DecimalFloat16(v, _) => v.append_null(),
833            Buffer::DecimalFloat32(v, _) => v.append_null(),
834            Buffer::DecimalFloat64(v, _) => v.append_null(),
835            Buffer::Utf8(v) => {
836                if valid {
837                    v.mutable.push_value("")
838                } else {
839                    v.mutable.push_null()
840                }
841            },
842            #[cfg(feature = "dtype-datetime")]
843            Buffer::Datetime { buf, .. } => buf.builder.append_null(),
844            #[cfg(feature = "dtype-date")]
845            Buffer::Date(v) => v.builder.append_null(),
846            #[cfg(feature = "dtype-categorical")]
847            Buffer::Categorical8(buf) => buf.builder.append_null(),
848            #[cfg(feature = "dtype-categorical")]
849            Buffer::Categorical16(buf) => buf.builder.append_null(),
850            #[cfg(feature = "dtype-categorical")]
851            Buffer::Categorical32(buf) => buf.builder.append_null(),
852        };
853    }
854
855    pub fn dtype(&self) -> DataType {
856        match self {
857            Buffer::Boolean(_) => DataType::Boolean,
858            #[cfg(feature = "dtype-i8")]
859            Buffer::Int8(_) => DataType::Int8,
860            #[cfg(feature = "dtype-i16")]
861            Buffer::Int16(_) => DataType::Int16,
862            Buffer::Int32(_) => DataType::Int32,
863            Buffer::Int64(_) => DataType::Int64,
864            #[cfg(feature = "dtype-i128")]
865            Buffer::Int128(_) => DataType::Int128,
866            #[cfg(feature = "dtype-u8")]
867            Buffer::UInt8(_) => DataType::UInt8,
868            #[cfg(feature = "dtype-u16")]
869            Buffer::UInt16(_) => DataType::UInt16,
870            Buffer::UInt32(_) => DataType::UInt32,
871            Buffer::UInt64(_) => DataType::UInt64,
872            #[cfg(feature = "dtype-u128")]
873            Buffer::UInt128(_) => DataType::UInt128,
874            #[cfg(feature = "dtype-f16")]
875            Buffer::Float16(_) | Buffer::DecimalFloat16(_, _) => DataType::Float16,
876            Buffer::Float32(_) | Buffer::DecimalFloat32(_, _) => DataType::Float32,
877            Buffer::Float64(_) | Buffer::DecimalFloat64(_, _) => DataType::Float64,
878            #[cfg(feature = "dtype-decimal")]
879            Buffer::Decimal(DecimalField {
880                precision, scale, ..
881            }) => DataType::Decimal(*precision, *scale),
882            Buffer::Utf8(_) => DataType::String,
883            #[cfg(feature = "dtype-datetime")]
884            Buffer::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),
885            #[cfg(feature = "dtype-date")]
886            Buffer::Date(_) => DataType::Date,
887            #[cfg(feature = "dtype-categorical")]
888            Buffer::Categorical8(buf) => buf.builder.dtype().clone(),
889            #[cfg(feature = "dtype-categorical")]
890            Buffer::Categorical16(buf) => buf.builder.dtype().clone(),
891            #[cfg(feature = "dtype-categorical")]
892            Buffer::Categorical32(buf) => buf.builder.dtype().clone(),
893        }
894    }
895
896    #[inline]
897    pub fn add(
898        &mut self,
899        bytes: &[u8],
900        ignore_errors: bool,
901        needs_escaping: bool,
902        missing_is_null: bool,
903    ) -> PolarsResult<()> {
904        use Buffer::*;
905        match self {
906            Boolean(buf) => <BooleanChunkedBuilder as ParsedBuffer>::parse_bytes(
907                buf,
908                bytes,
909                ignore_errors,
910                needs_escaping,
911                missing_is_null,
912                None,
913            ),
914            #[cfg(feature = "dtype-i8")]
915            Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuffer>::parse_bytes(
916                buf,
917                bytes,
918                ignore_errors,
919                needs_escaping,
920                missing_is_null,
921                None,
922            ),
923            #[cfg(feature = "dtype-i16")]
924            Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuffer>::parse_bytes(
925                buf,
926                bytes,
927                ignore_errors,
928                needs_escaping,
929                missing_is_null,
930                None,
931            ),
932            Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
933                buf,
934                bytes,
935                ignore_errors,
936                needs_escaping,
937                missing_is_null,
938                None,
939            ),
940            Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuffer>::parse_bytes(
941                buf,
942                bytes,
943                ignore_errors,
944                needs_escaping,
945                missing_is_null,
946                None,
947            ),
948            #[cfg(feature = "dtype-i128")]
949            Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
950                buf,
951                bytes,
952                ignore_errors,
953                needs_escaping,
954                missing_is_null,
955                None,
956            ),
957            #[cfg(feature = "dtype-u8")]
958            UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
959                buf,
960                bytes,
961                ignore_errors,
962                needs_escaping,
963                missing_is_null,
964                None,
965            ),
966            #[cfg(feature = "dtype-u16")]
967            UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuffer>::parse_bytes(
968                buf,
969                bytes,
970                ignore_errors,
971                needs_escaping,
972                missing_is_null,
973                None,
974            ),
975            UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuffer>::parse_bytes(
976                buf,
977                bytes,
978                ignore_errors,
979                needs_escaping,
980                missing_is_null,
981                None,
982            ),
983            UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
984                buf,
985                bytes,
986                ignore_errors,
987                needs_escaping,
988                missing_is_null,
989                None,
990            ),
991            #[cfg(feature = "dtype-u128")]
992            UInt128(buf) => <PrimitiveChunkedBuilder<UInt128Type> as ParsedBuffer>::parse_bytes(
993                buf,
994                bytes,
995                ignore_errors,
996                needs_escaping,
997                missing_is_null,
998                None,
999            ),
1000            #[cfg(feature = "dtype-f16")]
1001            Float16(buf) => <PrimitiveChunkedBuilder<Float16Type> as ParsedBuffer>::parse_bytes(
1002                buf,
1003                bytes,
1004                ignore_errors,
1005                needs_escaping,
1006                missing_is_null,
1007                None,
1008            ),
1009            Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
1010                buf,
1011                bytes,
1012                ignore_errors,
1013                needs_escaping,
1014                missing_is_null,
1015                None,
1016            ),
1017            Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
1018                buf,
1019                bytes,
1020                ignore_errors,
1021                needs_escaping,
1022                missing_is_null,
1023                None,
1024            ),
1025            #[cfg(feature = "dtype-f16")]
1026            DecimalFloat16(buf, scratch) => {
1027                prepare_decimal_comma(bytes, scratch);
1028                <PrimitiveChunkedBuilder<Float16Type> as ParsedBuffer>::parse_bytes(
1029                    buf,
1030                    scratch,
1031                    ignore_errors,
1032                    needs_escaping,
1033                    missing_is_null,
1034                    None,
1035                )
1036            },
1037            DecimalFloat32(buf, scratch) => {
1038                prepare_decimal_comma(bytes, scratch);
1039                <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
1040                    buf,
1041                    scratch,
1042                    ignore_errors,
1043                    needs_escaping,
1044                    missing_is_null,
1045                    None,
1046                )
1047            },
1048            DecimalFloat64(buf, scratch) => {
1049                prepare_decimal_comma(bytes, scratch);
1050                <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
1051                    buf,
1052                    scratch,
1053                    ignore_errors,
1054                    needs_escaping,
1055                    missing_is_null,
1056                    None,
1057                )
1058            },
1059            #[cfg(feature = "dtype-decimal")]
1060            Decimal(buf) => <DecimalField as ParsedBuffer>::parse_bytes(
1061                buf,
1062                bytes,
1063                ignore_errors,
1064                needs_escaping,
1065                missing_is_null,
1066                None,
1067            ),
1068            Utf8(buf) => <Utf8Field as ParsedBuffer>::parse_bytes(
1069                buf,
1070                bytes,
1071                ignore_errors,
1072                needs_escaping,
1073                missing_is_null,
1074                None,
1075            ),
1076            #[cfg(feature = "dtype-datetime")]
1077            Datetime { buf, time_unit, .. } => {
1078                <DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
1079                    buf,
1080                    bytes,
1081                    ignore_errors,
1082                    needs_escaping,
1083                    missing_is_null,
1084                    Some(*time_unit),
1085                )
1086            },
1087            #[cfg(feature = "dtype-date")]
1088            Date(buf) => <DatetimeField<Int32Type> as ParsedBuffer>::parse_bytes(
1089                buf,
1090                bytes,
1091                ignore_errors,
1092                needs_escaping,
1093                missing_is_null,
1094                None,
1095            ),
1096            #[cfg(feature = "dtype-categorical")]
1097            Categorical8(buf) => {
1098                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1099            },
1100            #[cfg(feature = "dtype-categorical")]
1101            Categorical16(buf) => {
1102                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1103            },
1104            #[cfg(feature = "dtype-categorical")]
1105            Categorical32(buf) => {
1106                buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
1107            },
1108        }
1109    }
1110}
1111
/// Copies `bytes` into `scratch`, replacing every `,` with `.`.
///
/// Any previous contents of `scratch` are discarded, and its existing
/// allocation is reused when it is large enough. All bytes other than `,`
/// are copied through unchanged.
#[inline]
fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {
    scratch.clear();
    // `extend` reserves once from the iterator's exact size hint, so neither a
    // manual `reserve` nor unchecked pushes are needed here.
    scratch.extend(
        bytes
            .iter()
            .map(|&byte| if byte == b',' { b'.' } else { byte }),
    );
}