polars_io/csv/read/
buffer.rs

1use arrow::array::MutableBinaryViewArray;
2use polars_core::prelude::*;
3use polars_error::to_compute_err;
4#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
5use polars_time::chunkedarray::string::Pattern;
6#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
7use polars_time::prelude::string::infer::{
8    DatetimeInfer, StrpTimeParser, TryFromWithUnit, infer_pattern_single,
9};
10use polars_utils::vec::PushUnchecked;
11
12use super::options::CsvEncoding;
13use super::parser::{is_whitespace, skip_whitespace};
14use super::utils::escape_field;
15
/// Parsing of a raw CSV field into the native value of a numeric polars type.
pub(crate) trait PrimitiveParser: PolarsNumericType {
    /// Parses `bytes` into `Self::Native`, returning `None` when the bytes
    /// cannot be parsed as that type.
    fn parse(bytes: &[u8]) -> Option<Self::Native>;
}
19
20impl PrimitiveParser for Float32Type {
21    #[inline]
22    fn parse(bytes: &[u8]) -> Option<f32> {
23        fast_float2::parse(bytes).ok()
24    }
25}
26impl PrimitiveParser for Float64Type {
27    #[inline]
28    fn parse(bytes: &[u8]) -> Option<f64> {
29        fast_float2::parse(bytes).ok()
30    }
31}
32
33#[cfg(feature = "dtype-u8")]
34impl PrimitiveParser for UInt8Type {
35    #[inline]
36    fn parse(bytes: &[u8]) -> Option<u8> {
37        atoi_simd::parse_skipped(bytes).ok()
38    }
39}
40#[cfg(feature = "dtype-u16")]
41impl PrimitiveParser for UInt16Type {
42    #[inline]
43    fn parse(bytes: &[u8]) -> Option<u16> {
44        atoi_simd::parse_skipped(bytes).ok()
45    }
46}
47impl PrimitiveParser for UInt32Type {
48    #[inline]
49    fn parse(bytes: &[u8]) -> Option<u32> {
50        atoi_simd::parse_skipped(bytes).ok()
51    }
52}
53impl PrimitiveParser for UInt64Type {
54    #[inline]
55    fn parse(bytes: &[u8]) -> Option<u64> {
56        atoi_simd::parse_skipped(bytes).ok()
57    }
58}
59#[cfg(feature = "dtype-i8")]
60impl PrimitiveParser for Int8Type {
61    #[inline]
62    fn parse(bytes: &[u8]) -> Option<i8> {
63        atoi_simd::parse_skipped(bytes).ok()
64    }
65}
66#[cfg(feature = "dtype-i16")]
67impl PrimitiveParser for Int16Type {
68    #[inline]
69    fn parse(bytes: &[u8]) -> Option<i16> {
70        atoi_simd::parse_skipped(bytes).ok()
71    }
72}
73impl PrimitiveParser for Int32Type {
74    #[inline]
75    fn parse(bytes: &[u8]) -> Option<i32> {
76        atoi_simd::parse_skipped(bytes).ok()
77    }
78}
79impl PrimitiveParser for Int64Type {
80    #[inline]
81    fn parse(bytes: &[u8]) -> Option<i64> {
82        atoi_simd::parse_skipped(bytes).ok()
83    }
84}
85#[cfg(feature = "dtype-i128")]
86impl PrimitiveParser for Int128Type {
87    #[inline]
88    fn parse(bytes: &[u8]) -> Option<i128> {
89        atoi_simd::parse_skipped(bytes).ok()
90    }
91}
92
/// A column buffer that can parse one raw CSV field and append the result.
trait ParsedBuffer {
    /// Parses a single field and appends the value (or a null) to the buffer.
    ///
    /// * `bytes` - the raw bytes of the field.
    /// * `ignore_errors` - when `true`, a value that cannot be parsed is
    ///   appended as null instead of returning an error.
    /// * `_needs_escaping` - the field is quoted; implementations strip the
    ///   surrounding quote bytes (or unescape the field) before parsing.
    /// * `_missing_is_null` - only read by the string buffer, where it decides
    ///   whether an empty field becomes null or an empty string.
    /// * `_time_unit` - only read by the date/datetime buffer.
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        _needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()>;
}
103
104impl<T> ParsedBuffer for PrimitiveChunkedBuilder<T>
105where
106    T: PolarsNumericType + PrimitiveParser,
107{
108    #[inline]
109    fn parse_bytes(
110        &mut self,
111        bytes: &[u8],
112        ignore_errors: bool,
113        needs_escaping: bool,
114        _missing_is_null: bool,
115        _time_unit: Option<TimeUnit>,
116    ) -> PolarsResult<()> {
117        if bytes.is_empty() {
118            self.append_null()
119        } else {
120            let bytes = if needs_escaping {
121                &bytes[1..bytes.len() - 1]
122            } else {
123                bytes
124            };
125
126            // legacy comment (remember this if you decide to use Results again):
127            // its faster to work on options.
128            // if we need to throw an error, we parse again to be able to throw the error
129
130            match T::parse(bytes) {
131                Some(value) => self.append_value(value),
132                None => {
133                    // try again without whitespace
134                    if !bytes.is_empty() && is_whitespace(bytes[0]) {
135                        let bytes = skip_whitespace(bytes);
136                        return self.parse_bytes(
137                            bytes,
138                            ignore_errors,
139                            false, // escaping was already done
140                            _missing_is_null,
141                            None,
142                        );
143                    }
144                    polars_ensure!(
145                        bytes.is_empty() || ignore_errors,
146                        ComputeError: "remaining bytes non-empty",
147                    );
148                    self.append_null()
149                },
150            };
151        }
152        Ok(())
153    }
154}
155
/// Buffer for a string column.
pub struct Utf8Field {
    /// Column name, used when the buffer is converted into a `Series`.
    name: PlSmallStr,
    /// Growable array holding the raw bytes of every parsed field.
    mutable: MutableBinaryViewArray<[u8]>,
    /// Reusable scratch space that receives the unescaped form of quoted fields.
    scratch: Vec<u8>,
    /// Quote byte used when unescaping fields.
    quote_char: u8,
    /// Encoding mode; `LossyUtf8` replaces invalid sequences instead of failing.
    encoding: CsvEncoding,
}
163
164impl Utf8Field {
165    fn new(
166        name: PlSmallStr,
167        capacity: usize,
168        quote_char: Option<u8>,
169        encoding: CsvEncoding,
170    ) -> Self {
171        Self {
172            name,
173            mutable: MutableBinaryViewArray::with_capacity(capacity),
174            scratch: vec![],
175            quote_char: quote_char.unwrap_or(b'"'),
176            encoding,
177        }
178    }
179}
180
/// Returns `true` when `bytes` is valid UTF-8, as checked by `simdutf8`.
#[inline]
pub fn validate_utf8(bytes: &[u8]) -> bool {
    simdutf8::basic::from_utf8(bytes).is_ok()
}
185
186impl ParsedBuffer for Utf8Field {
187    #[inline]
188    fn parse_bytes(
189        &mut self,
190        bytes: &[u8],
191        ignore_errors: bool,
192        needs_escaping: bool,
193        missing_is_null: bool,
194        _time_unit: Option<TimeUnit>,
195    ) -> PolarsResult<()> {
196        if bytes.is_empty() {
197            if missing_is_null {
198                self.mutable.push_null()
199            } else {
200                self.mutable.push(Some([]))
201            }
202            return Ok(());
203        }
204
205        // note that one branch writes without updating the length, so we must do that later.
206        let escaped_bytes = if needs_escaping {
207            self.scratch.clear();
208            self.scratch.reserve(bytes.len());
209            polars_ensure!(bytes.len() > 1 && bytes.last() == Some(&self.quote_char), ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
210
211            // SAFETY:
212            // we just allocated enough capacity and data_len is correct.
213            unsafe {
214                let n_written =
215                    escape_field(bytes, self.quote_char, self.scratch.spare_capacity_mut());
216                self.scratch.set_len(n_written);
217            }
218
219            self.scratch.as_slice()
220        } else {
221            bytes
222        };
223
224        if matches!(self.encoding, CsvEncoding::LossyUtf8) | ignore_errors {
225            // It is important that this happens after escaping, as invalid escaped string can produce
226            // invalid utf8.
227            let parse_result = validate_utf8(escaped_bytes);
228
229            match parse_result {
230                true => {
231                    let value = escaped_bytes;
232                    self.mutable.push_value(value)
233                },
234                false => {
235                    if matches!(self.encoding, CsvEncoding::LossyUtf8) {
236                        // TODO! do this without allocating
237                        let s = String::from_utf8_lossy(escaped_bytes);
238                        self.mutable.push_value(s.as_ref().as_bytes())
239                    } else if ignore_errors {
240                        self.mutable.push_null()
241                    } else {
242                        // If field before escaping is valid utf8, the escaping is incorrect.
243                        if needs_escaping && validate_utf8(bytes) {
244                            polars_bail!(ComputeError: "string field is not properly escaped");
245                        } else {
246                            polars_bail!(ComputeError: "invalid utf-8 sequence");
247                        }
248                    }
249                },
250            }
251        } else {
252            self.mutable.push_value(escaped_bytes)
253        }
254
255        Ok(())
256    }
257}
258
/// Placeholder used when the `dtype-categorical` feature is disabled, so that
/// the type name still exists.
#[cfg(not(feature = "dtype-categorical"))]
pub struct CategoricalField {
    // Unused marker; the struct carries no data in this configuration.
    phantom: std::marker::PhantomData<u8>,
}
263
/// Buffer for a categorical or enum column.
#[cfg(feature = "dtype-categorical")]
pub struct CategoricalField {
    /// Reusable scratch space that receives the unescaped form of quoted fields.
    escape_scratch: Vec<u8>,
    /// Quote byte used when unescaping fields.
    quote_char: u8,
    /// Builder that accumulates the category values.
    builder: CategoricalChunkedBuilder,
    /// `true` for enum columns: values are appended with `try_append_value`
    /// and the finished column is converted to the `Enum` dtype.
    is_enum: bool,
}
271
#[cfg(feature = "dtype-categorical")]
impl CategoricalField {
    /// Creates a buffer for a plain categorical column.
    ///
    /// When `quote_char` is `None` the double quote (`"`) is used.
    fn new(
        name: PlSmallStr,
        capacity: usize,
        quote_char: Option<u8>,
        ordering: CategoricalOrdering,
    ) -> Self {
        Self {
            escape_scratch: Vec::new(),
            quote_char: quote_char.unwrap_or(b'"'),
            builder: CategoricalChunkedBuilder::new(name, capacity, ordering),
            is_enum: false,
        }
    }

    /// Creates a buffer for an enum column from an already prepared `builder`.
    fn new_enum(quote_char: Option<u8>, builder: CategoricalChunkedBuilder) -> Self {
        Self {
            escape_scratch: Vec::new(),
            quote_char: quote_char.unwrap_or(b'"'),
            builder,
            is_enum: true,
        }
    }

    /// Parses one field and appends it as a category.
    ///
    /// Empty fields are appended as null. Invalid UTF-8 is appended as null
    /// when `ignore_errors` is set and is an error otherwise. Quoted fields
    /// are unescaped before being appended.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        _time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        if bytes.is_empty() {
            self.builder.append_null();
            return Ok(());
        }

        if !validate_utf8(bytes) {
            if ignore_errors {
                self.builder.append_null();
                return Ok(());
            }
            polars_bail!(ComputeError: "invalid utf-8 sequence");
        }

        // Pick the bytes that form the category: the unescaped copy for quoted
        // fields, the raw field otherwise.
        let key_bytes: &[u8] = if needs_escaping {
            polars_ensure!(bytes.len() > 1, ComputeError: "invalid csv file\n\nField `{}` is not properly escaped.", std::str::from_utf8(bytes).map_err(to_compute_err)?);
            self.escape_scratch.clear();
            self.escape_scratch.reserve(bytes.len());
            // SAFETY:
            // we just allocated enough capacity and data_len is correct.
            unsafe {
                let written = escape_field(
                    bytes,
                    self.quote_char,
                    self.escape_scratch.spare_capacity_mut(),
                );
                self.escape_scratch.set_len(written);
            }
            self.escape_scratch.as_slice()
        } else {
            bytes
        };

        // SAFETY:
        // just did utf8 check
        let key = unsafe { std::str::from_utf8_unchecked(key_bytes) };
        if self.is_enum {
            self.builder.try_append_value(key)?;
        } else {
            self.builder.append_value(key);
        }
        Ok(())
    }
}
354
355impl ParsedBuffer for BooleanChunkedBuilder {
356    #[inline]
357    fn parse_bytes(
358        &mut self,
359        bytes: &[u8],
360        ignore_errors: bool,
361        needs_escaping: bool,
362        _missing_is_null: bool,
363        _time_unit: Option<TimeUnit>,
364    ) -> PolarsResult<()> {
365        let bytes = if needs_escaping {
366            &bytes[1..bytes.len() - 1]
367        } else {
368            bytes
369        };
370        if bytes.eq_ignore_ascii_case(b"false") {
371            self.append_value(false);
372        } else if bytes.eq_ignore_ascii_case(b"true") {
373            self.append_value(true);
374        } else if ignore_errors || bytes.is_empty() {
375            self.append_null();
376        } else {
377            polars_bail!(
378                ComputeError: "error while parsing value {} as boolean",
379                String::from_utf8_lossy(bytes),
380            );
381        }
382        Ok(())
383    }
384}
385
/// Buffer for a date or datetime column, stored as its physical numeric type.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
pub struct DatetimeField<T: PolarsNumericType> {
    /// Parser for the last pattern that parsed successfully; `None` until a
    /// first value has been parsed.
    compiled: Option<DatetimeInfer<T>>,
    /// Builder that accumulates the parsed physical values.
    builder: PrimitiveChunkedBuilder<T>,
}
391
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T: PolarsNumericType> DatetimeField<T> {
    /// Creates an empty buffer with room for `capacity` values and no
    /// compiled pattern yet.
    fn new(name: PlSmallStr, capacity: usize) -> Self {
        Self {
            compiled: None,
            builder: PrimitiveChunkedBuilder::new(name, capacity),
        }
    }
}
402
/// Fallback date/datetime parser.
///
/// Validates the field as UTF-8, picks a pattern (the one already stored in
/// `buf.compiled`, or one inferred from this value), builds a `DatetimeInfer`
/// for it and parses the value. On success the parser is stored in
/// `buf.compiled` and the value is appended.
///
/// Every failure is appended as null when `ignore_errors` is set and is
/// returned as an error otherwise.
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
fn slow_datetime_parser<T>(
    buf: &mut DatetimeField<T>,
    bytes: &[u8],
    time_unit: Option<TimeUnit>,
    ignore_errors: bool,
) -> PolarsResult<()>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern>,
{
    let val = if bytes.is_ascii() {
        // SAFETY:
        // we just checked it is ascii
        unsafe { std::str::from_utf8_unchecked(bytes) }
    } else if let Ok(text) = std::str::from_utf8(bytes) {
        text
    } else if ignore_errors {
        buf.builder.append_null();
        return Ok(());
    } else {
        polars_bail!(ComputeError: "invalid utf-8 sequence");
    };

    // Reuse the stored pattern when there is one, otherwise infer it from
    // this value.
    let pattern = if let Some(compiled) = &buf.compiled {
        compiled.pattern
    } else if let Some(inferred) = infer_pattern_single(val) {
        inferred
    } else if ignore_errors {
        buf.builder.append_null();
        return Ok(());
    } else {
        polars_bail!(ComputeError: "could not find a 'date/datetime' pattern for '{}'", val)
    };

    let mut infer = match DatetimeInfer::try_from_with_unit(pattern, time_unit) {
        Ok(infer) => infer,
        Err(_) if ignore_errors => {
            buf.builder.append_null();
            return Ok(());
        },
        Err(err) => return Err(err),
    };

    let Some(parsed) = infer.parse(val) else {
        if ignore_errors {
            buf.builder.append_null();
            return Ok(());
        }
        polars_bail!(ComputeError: "could not parse '{}' with pattern '{:?}'", val, pattern)
    };

    buf.compiled = Some(infer);
    buf.builder.append_value(parsed);
    Ok(())
}
472
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
impl<T> ParsedBuffer for DatetimeField<T>
where
    T: PolarsNumericType,
    DatetimeInfer<T>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
    /// Parses a date/datetime field and appends it to the builder.
    ///
    /// Surrounding quotes are stripped from quoted fields and empty fields are
    /// appended as null. The stored parser is tried first; when there is none,
    /// or it fails, `slow_datetime_parser` handles the value.
    #[inline]
    fn parse_bytes(
        &mut self,
        bytes: &[u8],
        ignore_errors: bool,
        needs_escaping: bool,
        _missing_is_null: bool,
        time_unit: Option<TimeUnit>,
    ) -> PolarsResult<()> {
        let bytes = if needs_escaping && bytes.len() >= 2 {
            &bytes[1..bytes.len() - 1]
        } else {
            bytes
        };

        if bytes.is_empty() {
            // for types other than string `_missing_is_null` is irrelevant; we always append null
            self.builder.append_null();
            return Ok(());
        }

        // Fast path: only available once a pattern has been compiled.
        let fast_path = self
            .compiled
            .as_mut()
            .and_then(|compiled| compiled.parse_bytes(bytes, time_unit));

        if let Some(parsed) = fast_path {
            self.builder.append_value(parsed);
            return Ok(());
        }

        // fall back on chrono parser
        // this is a lot slower, we need to do utf8 checking and use
        // the slower parser
        slow_datetime_parser(self, bytes, time_unit, ignore_errors)
    }
}
515
516pub fn init_buffers(
517    projection: &[usize],
518    capacity: usize,
519    schema: &Schema,
520    quote_char: Option<u8>,
521    encoding: CsvEncoding,
522    decimal_comma: bool,
523) -> PolarsResult<Vec<Buffer>> {
524    projection
525        .iter()
526        .map(|&i| {
527            let (name, dtype) = schema.get_at_index(i).unwrap();
528            let name = name.clone();
529            let builder = match dtype {
530                &DataType::Boolean => Buffer::Boolean(BooleanChunkedBuilder::new(name, capacity)),
531                #[cfg(feature = "dtype-i8")]
532                &DataType::Int8 => Buffer::Int8(PrimitiveChunkedBuilder::new(name, capacity)),
533                #[cfg(feature = "dtype-i16")]
534                &DataType::Int16 => Buffer::Int16(PrimitiveChunkedBuilder::new(name, capacity)),
535                &DataType::Int32 => Buffer::Int32(PrimitiveChunkedBuilder::new(name, capacity)),
536                &DataType::Int64 => Buffer::Int64(PrimitiveChunkedBuilder::new(name, capacity)),
537                #[cfg(feature = "dtype-i128")]
538                &DataType::Int128 => Buffer::Int128(PrimitiveChunkedBuilder::new(name, capacity)),
539                #[cfg(feature = "dtype-u8")]
540                &DataType::UInt8 => Buffer::UInt8(PrimitiveChunkedBuilder::new(name, capacity)),
541                #[cfg(feature = "dtype-u16")]
542                &DataType::UInt16 => Buffer::UInt16(PrimitiveChunkedBuilder::new(name, capacity)),
543                &DataType::UInt32 => Buffer::UInt32(PrimitiveChunkedBuilder::new(name, capacity)),
544                &DataType::UInt64 => Buffer::UInt64(PrimitiveChunkedBuilder::new(name, capacity)),
545                &DataType::Float32 => {
546                    if decimal_comma {
547                        Buffer::DecimalFloat32(
548                            PrimitiveChunkedBuilder::new(name, capacity),
549                            Default::default(),
550                        )
551                    } else {
552                        Buffer::Float32(PrimitiveChunkedBuilder::new(name, capacity))
553                    }
554                },
555                &DataType::Float64 => {
556                    if decimal_comma {
557                        Buffer::DecimalFloat64(
558                            PrimitiveChunkedBuilder::new(name, capacity),
559                            Default::default(),
560                        )
561                    } else {
562                        Buffer::Float64(PrimitiveChunkedBuilder::new(name, capacity))
563                    }
564                },
565                &DataType::String => {
566                    Buffer::Utf8(Utf8Field::new(name, capacity, quote_char, encoding))
567                },
568                #[cfg(feature = "dtype-datetime")]
569                DataType::Datetime(time_unit, time_zone) => Buffer::Datetime {
570                    buf: DatetimeField::new(name, capacity),
571                    time_unit: *time_unit,
572                    time_zone: time_zone.clone(),
573                },
574                #[cfg(feature = "dtype-date")]
575                &DataType::Date => Buffer::Date(DatetimeField::new(name, capacity)),
576                #[cfg(feature = "dtype-categorical")]
577                DataType::Categorical(_, ordering) => Buffer::Categorical(CategoricalField::new(
578                    name, capacity, quote_char, *ordering,
579                )),
580                #[cfg(feature = "dtype-categorical")]
581                DataType::Enum(rev_map, _) => {
582                    let Some(rev_map) = rev_map else {
583                        polars_bail!(ComputeError: "enum categories must be set")
584                    };
585                    let cats = rev_map.get_categories();
586                    let mut builder =
587                        CategoricalChunkedBuilder::new(name, capacity, Default::default());
588                    for cat in cats.values_iter() {
589                        builder.register_value(cat);
590                    }
591                    Buffer::Categorical(CategoricalField::new_enum(quote_char, builder))
592                },
593                dt => polars_bail!(
594                    ComputeError: "unsupported data type when reading CSV: {} when reading CSV", dt,
595                ),
596            };
597            Ok(builder)
598        })
599        .collect()
600}
601
/// Column buffer used while reading a CSV file; one variant per supported
/// data type.
#[allow(clippy::large_enum_variant)]
pub enum Buffer {
    /// Boolean column.
    Boolean(BooleanChunkedBuilder),
    #[cfg(feature = "dtype-i8")]
    Int8(PrimitiveChunkedBuilder<Int8Type>),
    #[cfg(feature = "dtype-i16")]
    Int16(PrimitiveChunkedBuilder<Int16Type>),
    Int32(PrimitiveChunkedBuilder<Int32Type>),
    Int64(PrimitiveChunkedBuilder<Int64Type>),
    #[cfg(feature = "dtype-i128")]
    Int128(PrimitiveChunkedBuilder<Int128Type>),
    #[cfg(feature = "dtype-u8")]
    UInt8(PrimitiveChunkedBuilder<UInt8Type>),
    #[cfg(feature = "dtype-u16")]
    UInt16(PrimitiveChunkedBuilder<UInt16Type>),
    UInt32(PrimitiveChunkedBuilder<UInt32Type>),
    UInt64(PrimitiveChunkedBuilder<UInt64Type>),
    Float32(PrimitiveChunkedBuilder<Float32Type>),
    Float64(PrimitiveChunkedBuilder<Float64Type>),
    /// String column; see [`Utf8Field`].
    Utf8(Utf8Field),
    /// Datetime column, stored as `Int64` and cast to
    /// `Datetime(time_unit, time_zone)` when finished.
    #[cfg(feature = "dtype-datetime")]
    Datetime {
        buf: DatetimeField<Int64Type>,
        time_unit: TimeUnit,
        time_zone: Option<TimeZone>,
    },
    /// Date column, stored as `Int32` and cast to `Date` when finished.
    #[cfg(feature = "dtype-date")]
    Date(DatetimeField<Int32Type>),
    /// Categorical or enum column. Without the `dtype-categorical` feature the
    /// payload is an empty placeholder and using this variant panics.
    #[allow(dead_code)]
    Categorical(CategoricalField),
    /// `Float32` column selected when `decimal_comma` is enabled.
    /// NOTE(review): the `Vec<u8>` is presumably scratch space for rewriting
    /// the decimal separator — its use is not visible here, confirm.
    DecimalFloat32(PrimitiveChunkedBuilder<Float32Type>, Vec<u8>),
    /// `Float64` counterpart of [`Buffer::DecimalFloat32`].
    DecimalFloat64(PrimitiveChunkedBuilder<Float64Type>, Vec<u8>),
}
636
637impl Buffer {
638    pub fn into_series(self) -> PolarsResult<Series> {
639        let s = match self {
640            Buffer::Boolean(v) => v.finish().into_series(),
641            #[cfg(feature = "dtype-i8")]
642            Buffer::Int8(v) => v.finish().into_series(),
643            #[cfg(feature = "dtype-i16")]
644            Buffer::Int16(v) => v.finish().into_series(),
645            Buffer::Int32(v) => v.finish().into_series(),
646            Buffer::Int64(v) => v.finish().into_series(),
647            #[cfg(feature = "dtype-i128")]
648            Buffer::Int128(v) => v.finish().into_series(),
649            #[cfg(feature = "dtype-u8")]
650            Buffer::UInt8(v) => v.finish().into_series(),
651            #[cfg(feature = "dtype-u16")]
652            Buffer::UInt16(v) => v.finish().into_series(),
653            Buffer::UInt32(v) => v.finish().into_series(),
654            Buffer::UInt64(v) => v.finish().into_series(),
655            Buffer::Float32(v) => v.finish().into_series(),
656            Buffer::Float64(v) => v.finish().into_series(),
657            Buffer::DecimalFloat32(v, _) => v.finish().into_series(),
658            Buffer::DecimalFloat64(v, _) => v.finish().into_series(),
659            #[cfg(feature = "dtype-datetime")]
660            Buffer::Datetime {
661                buf,
662                time_unit,
663                time_zone,
664            } => buf
665                .builder
666                .finish()
667                .into_series()
668                .cast(&DataType::Datetime(time_unit, time_zone))
669                .unwrap(),
670            #[cfg(feature = "dtype-date")]
671            Buffer::Date(v) => v
672                .builder
673                .finish()
674                .into_series()
675                .cast(&DataType::Date)
676                .unwrap(),
677
678            Buffer::Utf8(v) => {
679                let arr = v.mutable.freeze();
680                StringChunked::with_chunk(v.name.clone(), unsafe { arr.to_utf8view_unchecked() })
681                    .into_series()
682            },
683            #[allow(unused_variables)]
684            Buffer::Categorical(buf) => {
685                #[cfg(feature = "dtype-categorical")]
686                {
687                    let ca = buf.builder.finish();
688
689                    if buf.is_enum {
690                        let DataType::Categorical(Some(rev_map), _) = ca.dtype() else {
691                            unreachable!()
692                        };
693                        let idx = ca.physical().clone();
694                        let dtype = DataType::Enum(Some(rev_map.clone()), Default::default());
695
696                        unsafe {
697                            CategoricalChunked::from_cats_and_dtype_unchecked(idx, dtype)
698                                .into_series()
699                        }
700                    } else {
701                        ca.into_series()
702                    }
703                }
704                #[cfg(not(feature = "dtype-categorical"))]
705                {
706                    panic!("activate 'dtype-categorical' feature")
707                }
708            },
709        };
710        Ok(s)
711    }
712
713    pub fn add_null(&mut self, valid: bool) {
714        match self {
715            Buffer::Boolean(v) => v.append_null(),
716            #[cfg(feature = "dtype-i8")]
717            Buffer::Int8(v) => v.append_null(),
718            #[cfg(feature = "dtype-i16")]
719            Buffer::Int16(v) => v.append_null(),
720            Buffer::Int32(v) => v.append_null(),
721            Buffer::Int64(v) => v.append_null(),
722            #[cfg(feature = "dtype-i128")]
723            Buffer::Int128(v) => v.append_null(),
724            #[cfg(feature = "dtype-u8")]
725            Buffer::UInt8(v) => v.append_null(),
726            #[cfg(feature = "dtype-u16")]
727            Buffer::UInt16(v) => v.append_null(),
728            Buffer::UInt32(v) => v.append_null(),
729            Buffer::UInt64(v) => v.append_null(),
730            Buffer::Float32(v) => v.append_null(),
731            Buffer::Float64(v) => v.append_null(),
732            Buffer::DecimalFloat32(v, _) => v.append_null(),
733            Buffer::DecimalFloat64(v, _) => v.append_null(),
734            Buffer::Utf8(v) => {
735                if valid {
736                    v.mutable.push_value("")
737                } else {
738                    v.mutable.push_null()
739                }
740            },
741            #[cfg(feature = "dtype-datetime")]
742            Buffer::Datetime { buf, .. } => buf.builder.append_null(),
743            #[cfg(feature = "dtype-date")]
744            Buffer::Date(v) => v.builder.append_null(),
745            #[allow(unused_variables)]
746            Buffer::Categorical(cat_builder) => {
747                #[cfg(feature = "dtype-categorical")]
748                {
749                    cat_builder.builder.append_null()
750                }
751                #[cfg(not(feature = "dtype-categorical"))]
752                {
753                    panic!("activate 'dtype-categorical' feature")
754                }
755            },
756        };
757    }
758
759    pub fn dtype(&self) -> DataType {
760        match self {
761            Buffer::Boolean(_) => DataType::Boolean,
762            #[cfg(feature = "dtype-i8")]
763            Buffer::Int8(_) => DataType::Int8,
764            #[cfg(feature = "dtype-i16")]
765            Buffer::Int16(_) => DataType::Int16,
766            Buffer::Int32(_) => DataType::Int32,
767            Buffer::Int64(_) => DataType::Int64,
768            #[cfg(feature = "dtype-i128")]
769            Buffer::Int128(_) => DataType::Int128,
770            #[cfg(feature = "dtype-u8")]
771            Buffer::UInt8(_) => DataType::UInt8,
772            #[cfg(feature = "dtype-u16")]
773            Buffer::UInt16(_) => DataType::UInt16,
774            Buffer::UInt32(_) => DataType::UInt32,
775            Buffer::UInt64(_) => DataType::UInt64,
776            Buffer::Float32(_) | Buffer::DecimalFloat32(_, _) => DataType::Float32,
777            Buffer::Float64(_) | Buffer::DecimalFloat64(_, _) => DataType::Float64,
778            Buffer::Utf8(_) => DataType::String,
779            #[cfg(feature = "dtype-datetime")]
780            Buffer::Datetime { time_unit, .. } => DataType::Datetime(*time_unit, None),
781            #[cfg(feature = "dtype-date")]
782            Buffer::Date(_) => DataType::Date,
783            Buffer::Categorical(_) => {
784                #[cfg(feature = "dtype-categorical")]
785                {
786                    DataType::Categorical(None, Default::default())
787                }
788
789                #[cfg(not(feature = "dtype-categorical"))]
790                {
791                    panic!("activate 'dtype-categorical' feature")
792                }
793            },
794        }
795    }
796
797    #[inline]
798    pub fn add(
799        &mut self,
800        bytes: &[u8],
801        ignore_errors: bool,
802        needs_escaping: bool,
803        missing_is_null: bool,
804    ) -> PolarsResult<()> {
805        use Buffer::*;
806        match self {
807            Boolean(buf) => <BooleanChunkedBuilder as ParsedBuffer>::parse_bytes(
808                buf,
809                bytes,
810                ignore_errors,
811                needs_escaping,
812                missing_is_null,
813                None,
814            ),
815            #[cfg(feature = "dtype-i8")]
816            Int8(buf) => <PrimitiveChunkedBuilder<Int8Type> as ParsedBuffer>::parse_bytes(
817                buf,
818                bytes,
819                ignore_errors,
820                needs_escaping,
821                missing_is_null,
822                None,
823            ),
824            #[cfg(feature = "dtype-i16")]
825            Int16(buf) => <PrimitiveChunkedBuilder<Int16Type> as ParsedBuffer>::parse_bytes(
826                buf,
827                bytes,
828                ignore_errors,
829                needs_escaping,
830                missing_is_null,
831                None,
832            ),
833            Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
834                buf,
835                bytes,
836                ignore_errors,
837                needs_escaping,
838                missing_is_null,
839                None,
840            ),
841            Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuffer>::parse_bytes(
842                buf,
843                bytes,
844                ignore_errors,
845                needs_escaping,
846                missing_is_null,
847                None,
848            ),
849            #[cfg(feature = "dtype-i128")]
850            Int128(buf) => <PrimitiveChunkedBuilder<Int128Type> as ParsedBuffer>::parse_bytes(
851                buf,
852                bytes,
853                ignore_errors,
854                needs_escaping,
855                missing_is_null,
856                None,
857            ),
858            #[cfg(feature = "dtype-u8")]
859            UInt8(buf) => <PrimitiveChunkedBuilder<UInt8Type> as ParsedBuffer>::parse_bytes(
860                buf,
861                bytes,
862                ignore_errors,
863                needs_escaping,
864                missing_is_null,
865                None,
866            ),
867            #[cfg(feature = "dtype-u16")]
868            UInt16(buf) => <PrimitiveChunkedBuilder<UInt16Type> as ParsedBuffer>::parse_bytes(
869                buf,
870                bytes,
871                ignore_errors,
872                needs_escaping,
873                missing_is_null,
874                None,
875            ),
876            UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuffer>::parse_bytes(
877                buf,
878                bytes,
879                ignore_errors,
880                needs_escaping,
881                missing_is_null,
882                None,
883            ),
884            UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
885                buf,
886                bytes,
887                ignore_errors,
888                needs_escaping,
889                missing_is_null,
890                None,
891            ),
892            Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
893                buf,
894                bytes,
895                ignore_errors,
896                needs_escaping,
897                missing_is_null,
898                None,
899            ),
900            Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
901                buf,
902                bytes,
903                ignore_errors,
904                needs_escaping,
905                missing_is_null,
906                None,
907            ),
908            DecimalFloat32(buf, scratch) => {
909                prepare_decimal_comma(bytes, scratch);
910                <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
911                    buf,
912                    scratch,
913                    ignore_errors,
914                    needs_escaping,
915                    missing_is_null,
916                    None,
917                )
918            },
919            DecimalFloat64(buf, scratch) => {
920                prepare_decimal_comma(bytes, scratch);
921                <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
922                    buf,
923                    scratch,
924                    ignore_errors,
925                    needs_escaping,
926                    missing_is_null,
927                    None,
928                )
929            },
930            Utf8(buf) => <Utf8Field as ParsedBuffer>::parse_bytes(
931                buf,
932                bytes,
933                ignore_errors,
934                needs_escaping,
935                missing_is_null,
936                None,
937            ),
938            #[cfg(feature = "dtype-datetime")]
939            Datetime { buf, time_unit, .. } => {
940                <DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
941                    buf,
942                    bytes,
943                    ignore_errors,
944                    needs_escaping,
945                    missing_is_null,
946                    Some(*time_unit),
947                )
948            },
949            #[cfg(feature = "dtype-date")]
950            Date(buf) => <DatetimeField<Int32Type> as ParsedBuffer>::parse_bytes(
951                buf,
952                bytes,
953                ignore_errors,
954                needs_escaping,
955                missing_is_null,
956                None,
957            ),
958            #[allow(unused_variables)]
959            Categorical(buf) => {
960                #[cfg(feature = "dtype-categorical")]
961                {
962                    buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
963                }
964
965                #[cfg(not(feature = "dtype-categorical"))]
966                {
967                    panic!("activate 'dtype-categorical' feature")
968                }
969            },
970        }
971    }
972}
973
974#[inline]
975fn prepare_decimal_comma(bytes: &[u8], scratch: &mut Vec<u8>) {
976    scratch.clear();
977    scratch.reserve(bytes.len());
978
979    // SAFETY: we pre-allocated.
980    for &byte in bytes {
981        if byte == b',' {
982            unsafe { scratch.push_unchecked(b'.') }
983        } else {
984            unsafe { scratch.push_unchecked(byte) }
985        }
986    }
987}