polars_io/csv/write/write_impl/
serializer.rs

//! This file is complicated because we have complicated escape handling. We want to avoid having
//! to write down each combination of type & escaping, but we also want the compiler to optimize them
//! to efficient machine code - so no dynamic dispatch. That means a lot of generics and macros.
//!
//! We need to differentiate between several kinds of types, and several kinds of escaping we support:
//!
//!  - The simplest escaping mechanisms are [`QuoteStyle::Always`] and [`QuoteStyle::Never`].
//!    For `Never` we just never quote. For `Always` we pass any serializer that never quotes
//!    to [`quote_serializer()`] and it becomes quoted properly.
//!  - [`QuoteStyle::Necessary`] (the default) is only relevant for strings and floats with decimal_comma,
//!    as these are the only types that can have newlines (row separators), commas (default column separators)
//!    or quotes. String escaping is complicated anyway, and it is all inside [`string_serializer()`].
//!  - The real complication is [`QuoteStyle::NonNumeric`], which doesn't quote numbers (unless necessary)
//!    and nulls, and quotes everything else. The problem is that nulls can be within any type, so we
//!    need to handle two possibilities of quoting everywhere.
//!
//! So in case the chosen style is anything but `NonNumeric`, we statically know for each column except strings
//! whether it should be quoted (and for strings too when not `Necessary`). There we use
//! `quote_serializer()` or nothing.
//!
//! But to help with `NonNumeric`, each serializer carries the potential to distinguish between nulls and non-nulls,
//! and quote the latter and not the former. But in order to not have the branch when we statically know the answer,
//! we have an option to statically disable it with a const generic flag `QUOTE_NON_NULL`. Numbers (that should never
//! be quoted with `NonNumeric`) just always disable this flag.
//!
//! So we have three possibilities:
//!
//!  1. A serializer that never quotes. This is a bare serializer with `QUOTE_NON_NULL = false`.
//!  2. A serializer that always quotes. This is a serializer wrapped with `quote_serializer()`,
//!     but also with `QUOTE_NON_NULL = false`.
//!  3. A serializer that quotes only non-nulls. This is a bare serializer with `QUOTE_NON_NULL = true`.
32
33use std::fmt::LowerExp;
34use std::io::Write;
35
36use arrow::array::{Array, BooleanArray, Float16Array, NullArray, PrimitiveArray, Utf8ViewArray};
37use arrow::legacy::time_zone::Tz;
38use arrow::types::NativeType;
39#[cfg(feature = "timezones")]
40use chrono::TimeZone;
41use memchr::{memchr_iter, memchr3};
42use num_traits::NumCast;
43use polars_core::prelude::*;
44use polars_utils::float16::pf16;
45
46use crate::csv::write::{QuoteStyle, SerializeOptions};
47
/// Panic message used when a serializer is asked for more items than its iterator yields.
const TOO_MANY_MSG: &str = "too many items requested from CSV serializer";
/// Panic message used when an array cannot be downcast to the type a serializer expects.
const ARRAY_MISMATCH_MSG: &str = "wrong array type";
50
/// A formatting sink that throws away everything written to it.
///
/// Its own `write_str` never fails, so formatting a value into it can only fail if the value's
/// formatting implementation reports an error.
// Only referenced from feature-gated code, hence the `dead_code` allowance.
#[allow(dead_code)]
struct IgnoreFmt;

impl std::fmt::Write for IgnoreFmt {
    /// Accepts `_discarded` and drops it without storing anything.
    fn write_str(&mut self, _discarded: &str) -> std::fmt::Result {
        std::fmt::Result::Ok(())
    }
}
58
/// A stateful writer that emits the values of one array, one field per call.
pub(super) trait Serializer<'a> {
    /// Appends the next field to `buf`, formatted according to `options`.
    fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions);
    /// Updates the array without changing the configuration.
    fn update_array(&mut self, array: &'a dyn Array);
}
64
65fn make_serializer<'a, T, I: Iterator<Item = Option<T>>, const QUOTE_NON_NULL: bool>(
66    f: impl FnMut(T, &mut Vec<u8>, &SerializeOptions),
67    iter: I,
68    update_array: impl FnMut(&'a dyn Array) -> I,
69) -> impl Serializer<'a> {
70    struct SerializerImpl<F, I, Update, const QUOTE_NON_NULL: bool> {
71        f: F,
72        iter: I,
73        update_array: Update,
74    }
75
76    impl<'a, T, F, I, Update, const QUOTE_NON_NULL: bool> Serializer<'a>
77        for SerializerImpl<F, I, Update, QUOTE_NON_NULL>
78    where
79        F: FnMut(T, &mut Vec<u8>, &SerializeOptions),
80        I: Iterator<Item = Option<T>>,
81        Update: FnMut(&'a dyn Array) -> I,
82    {
83        fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
84            let item = self.iter.next().expect(TOO_MANY_MSG);
85            match item {
86                Some(item) => {
87                    if QUOTE_NON_NULL {
88                        buf.push(options.quote_char);
89                    }
90                    (self.f)(item, buf, options);
91                    if QUOTE_NON_NULL {
92                        buf.push(options.quote_char);
93                    }
94                },
95                None => buf.extend_from_slice(options.null.as_bytes()),
96            }
97        }
98
99        fn update_array(&mut self, array: &'a dyn Array) {
100            self.iter = (self.update_array)(array);
101        }
102    }
103
104    SerializerImpl::<_, _, _, QUOTE_NON_NULL> {
105        f,
106        iter,
107        update_array,
108    }
109}
110
111fn integer_serializer<I: NativeType + itoa::Integer>(
112    array: &PrimitiveArray<I>,
113) -> impl Serializer<'_> {
114    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
115        let mut buffer = itoa::Buffer::new();
116        let value = buffer.format(item);
117        buf.extend_from_slice(value.as_bytes());
118    };
119
120    make_serializer::<_, _, false>(f, array.iter(), |array| {
121        array
122            .as_any()
123            .downcast_ref::<PrimitiveArray<I>>()
124            .expect(ARRAY_MISMATCH_MSG)
125            .iter()
126    })
127}
128
129fn float_serializer_no_precision_autoformat_f16(array: &Float16Array) -> impl Serializer<'_> {
130    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
131        let mut buffer = ryu::Buffer::new();
132        let cast: f32 = NumCast::from(item).unwrap();
133        let value = buffer.format(cast);
134        buf.extend_from_slice(value.as_bytes());
135    };
136    float_serializer_no_precision_autoformat_(array, f)
137}
138
139fn float_serializer_no_precision_autoformat<I: NativeType + ryu::Float>(
140    array: &PrimitiveArray<I>,
141) -> impl Serializer<'_> {
142    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
143        let mut buffer = ryu::Buffer::new();
144        let value = buffer.format(item);
145        buf.extend_from_slice(value.as_bytes());
146    };
147    float_serializer_no_precision_autoformat_(array, f)
148}
149
150fn float_serializer_no_precision_autoformat_<
151    'a,
152    I: NativeType,
153    F: Fn(&'a I, &mut Vec<u8>, &SerializeOptions),
154>(
155    array: &'a PrimitiveArray<I>,
156    f: F,
157) -> impl Serializer<'a> {
158    make_serializer::<_, _, false>(f, array.iter(), |array| {
159        array
160            .as_any()
161            .downcast_ref::<PrimitiveArray<I>>()
162            .expect(ARRAY_MISMATCH_MSG)
163            .iter()
164    })
165}
166
167fn float_serializer_no_precision_autoformat_decimal_comma_f16(
168    array: &Float16Array,
169) -> impl Serializer<'_> {
170    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
171        let mut buffer = ryu::Buffer::new();
172        let cast: f32 = NumCast::from(item).unwrap();
173        let value = buffer.format(cast);
174
175        for ch in value.as_bytes() {
176            buf.push(if *ch == b'.' { b',' } else { *ch });
177        }
178    };
179    float_serializer_no_precision_autoformat_decimal_comma_(array, f)
180}
181
182fn float_serializer_no_precision_autoformat_decimal_comma<I: NativeType + ryu::Float>(
183    array: &PrimitiveArray<I>,
184) -> impl Serializer<'_> {
185    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
186        let mut buffer = ryu::Buffer::new();
187        let value = buffer.format(item).as_bytes();
188
189        for ch in value {
190            buf.push(if *ch == b'.' { b',' } else { *ch });
191        }
192    };
193    float_serializer_no_precision_autoformat_decimal_comma_(array, f)
194}
195
196fn float_serializer_no_precision_autoformat_decimal_comma_<
197    'a,
198    I: NativeType,
199    F: Fn(&'a I, &mut Vec<u8>, &SerializeOptions),
200>(
201    array: &'a PrimitiveArray<I>,
202    f: F,
203) -> impl Serializer<'a> {
204    make_serializer::<_, _, false>(f, array.iter(), |array| {
205        array
206            .as_any()
207            .downcast_ref::<PrimitiveArray<I>>()
208            .expect(ARRAY_MISMATCH_MSG)
209            .iter()
210    })
211}
212
213fn float_serializer_no_precision_scientific<I: NativeType + LowerExp>(
214    array: &PrimitiveArray<I>,
215) -> impl Serializer<'_> {
216    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
217        // Float writing into a buffer of `Vec<u8>` cannot fail.
218        let _ = write!(buf, "{item:.e}");
219    };
220
221    make_serializer::<_, _, false>(f, array.iter(), |array| {
222        array
223            .as_any()
224            .downcast_ref::<PrimitiveArray<I>>()
225            .expect(ARRAY_MISMATCH_MSG)
226            .iter()
227    })
228}
229
230fn float_serializer_no_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
231    array: &PrimitiveArray<I>,
232) -> impl Serializer<'_> {
233    let mut scratch = Vec::new();
234
235    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
236        // Float writing into a buffer of `Vec<u8>` cannot fail.
237        let _ = write!(&mut scratch, "{item:.e}");
238        for c in &mut scratch {
239            if *c == b'.' {
240                *c = b',';
241                break;
242            }
243        }
244        buf.extend_from_slice(&scratch);
245    };
246
247    make_serializer::<_, _, false>(f, array.iter(), |array| {
248        array
249            .as_any()
250            .downcast_ref::<PrimitiveArray<I>>()
251            .expect(ARRAY_MISMATCH_MSG)
252            .iter()
253    })
254}
255
256fn float_serializer_no_precision_positional<I: NativeType + NumCast>(
257    array: &PrimitiveArray<I>,
258) -> impl Serializer<'_> {
259    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
260        let v: f64 = NumCast::from(item).unwrap();
261        let _ = write!(buf, "{v}");
262    };
263
264    make_serializer::<_, _, false>(f, array.iter(), |array| {
265        array
266            .as_any()
267            .downcast_ref::<PrimitiveArray<I>>()
268            .expect(ARRAY_MISMATCH_MSG)
269            .iter()
270    })
271}
272
273fn float_serializer_no_precision_positional_decimal_comma<I: NativeType + NumCast>(
274    array: &PrimitiveArray<I>,
275) -> impl Serializer<'_> {
276    let mut scratch = Vec::new();
277
278    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
279        scratch.clear();
280        let v: f64 = NumCast::from(item).unwrap();
281        let _ = write!(&mut scratch, "{v}");
282        for c in &mut scratch {
283            if *c == b'.' {
284                *c = b',';
285                break;
286            }
287        }
288        buf.extend_from_slice(&scratch);
289    };
290
291    make_serializer::<_, _, false>(f, array.iter(), |array| {
292        array
293            .as_any()
294            .downcast_ref::<PrimitiveArray<I>>()
295            .expect(ARRAY_MISMATCH_MSG)
296            .iter()
297    })
298}
299
300fn float_serializer_with_precision_scientific<I: NativeType + LowerExp>(
301    array: &PrimitiveArray<I>,
302    precision: usize,
303) -> impl Serializer<'_> {
304    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
305        // Float writing into a buffer of `Vec<u8>` cannot fail.
306        let _ = write!(buf, "{item:.precision$e}");
307    };
308
309    make_serializer::<_, _, false>(f, array.iter(), |array| {
310        array
311            .as_any()
312            .downcast_ref::<PrimitiveArray<I>>()
313            .expect(ARRAY_MISMATCH_MSG)
314            .iter()
315    })
316}
317
318fn float_serializer_with_precision_scientific_decimal_comma<I: NativeType + LowerExp>(
319    array: &PrimitiveArray<I>,
320    precision: usize,
321) -> impl Serializer<'_> {
322    let mut scratch = Vec::new();
323
324    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
325        scratch.clear();
326        // Float writing into a buffer of `Vec<u8>` cannot fail.
327        let _ = write!(&mut scratch, "{item:.precision$e}");
328        for c in &mut scratch {
329            if *c == b'.' {
330                *c = b',';
331                break;
332            }
333        }
334        buf.extend_from_slice(&scratch);
335    };
336
337    make_serializer::<_, _, false>(f, array.iter(), |array| {
338        array
339            .as_any()
340            .downcast_ref::<PrimitiveArray<I>>()
341            .expect(ARRAY_MISMATCH_MSG)
342            .iter()
343    })
344}
345
346fn float_serializer_with_precision_positional<I: NativeType>(
347    array: &PrimitiveArray<I>,
348    precision: usize,
349) -> impl Serializer<'_> {
350    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
351        // Float writing into a buffer of `Vec<u8>` cannot fail.
352        let _ = write!(buf, "{item:.precision$}");
353    };
354
355    make_serializer::<_, _, false>(f, array.iter(), |array| {
356        array
357            .as_any()
358            .downcast_ref::<PrimitiveArray<I>>()
359            .expect(ARRAY_MISMATCH_MSG)
360            .iter()
361    })
362}
363
364fn float_serializer_with_precision_positional_decimal_comma<I: NativeType>(
365    array: &PrimitiveArray<I>,
366    precision: usize,
367) -> impl Serializer<'_> {
368    let mut scratch = Vec::new();
369
370    let f = move |&item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
371        scratch.clear();
372        let _ = write!(&mut scratch, "{item:.precision$}");
373        for c in &mut scratch {
374            if *c == b'.' {
375                *c = b',';
376                break;
377            }
378        }
379        buf.extend_from_slice(&scratch);
380    };
381
382    make_serializer::<_, _, false>(f, array.iter(), |array| {
383        array
384            .as_any()
385            .downcast_ref::<PrimitiveArray<I>>()
386            .expect(ARRAY_MISMATCH_MSG)
387            .iter()
388    })
389}
390
391fn null_serializer(_array: &NullArray) -> impl Serializer<'_> {
392    struct NullSerializer;
393    impl<'a> Serializer<'a> for NullSerializer {
394        fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
395            buf.extend_from_slice(options.null.as_bytes());
396        }
397        fn update_array(&mut self, _array: &'a dyn Array) {}
398    }
399    NullSerializer
400}
401
402fn bool_serializer<const QUOTE_NON_NULL: bool>(array: &BooleanArray) -> impl Serializer<'_> {
403    let f = move |item, buf: &mut Vec<u8>, _options: &SerializeOptions| {
404        let s = if item { "true" } else { "false" };
405        buf.extend_from_slice(s.as_bytes());
406    };
407
408    make_serializer::<_, _, QUOTE_NON_NULL>(f, array.iter(), |array| {
409        array
410            .as_any()
411            .downcast_ref::<BooleanArray>()
412            .expect(ARRAY_MISMATCH_MSG)
413            .iter()
414    })
415}
416
/// Serializer for 128-bit decimal columns with the given `scale`.
///
/// The trailing-zero trimming setting is read once when the serializer is built; the decimal
/// separator follows `options.decimal_comma` on every call.
#[cfg(feature = "dtype-decimal")]
fn decimal_serializer(array: &PrimitiveArray<i128>, scale: usize) -> impl Serializer<'_> {
    let trim_zeros = arrow::compute::decimal::get_trim_decimal_zeros();
    let mut formatter = polars_compute::decimal::DecimalFmtBuffer::new();

    let write_decimal = move |&value, out: &mut Vec<u8>, options: &SerializeOptions| {
        let rendered = formatter.format_dec128(value, scale, trim_zeros, options.decimal_comma);
        out.extend_from_slice(rendered.as_bytes());
    };

    make_serializer::<_, _, false>(write_decimal, array.iter(), |next| {
        let typed = next.as_any().downcast_ref::<PrimitiveArray<i128>>();
        typed.expect(ARRAY_MISMATCH_MSG).iter()
    })
}
438
/// Serializer that delegates formatting of each non-null value to `callback`.
///
/// `QUOTE_NON_NULL` is forwarded to `make_serializer` and controls whether non-null values
/// are wrapped in quotes.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn callback_serializer<'a, T: NativeType, const QUOTE_NON_NULL: bool>(
    array: &'a PrimitiveArray<T>,
    mut callback: impl FnMut(T, &mut Vec<u8>) + 'a,
) -> impl Serializer<'a> {
    // The callback has no use for the serialize options, so they are dropped here.
    let forward = move |&value, out: &mut Vec<u8>, _: &SerializeOptions| {
        callback(value, out);
    };

    make_serializer::<_, _, QUOTE_NON_NULL>(forward, array.iter(), |next| {
        let typed = next.as_any().downcast_ref::<PrimitiveArray<T>>();
        typed.expect(ARRAY_MISMATCH_MSG).iter()
    })
}
460
/// Borrowing iterator over a slice of chrono format items, as handed to the `format_fn`
/// callback of `date_and_time_serializer`.
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
type ChronoFormatIter<'a, 'b> = std::slice::Iter<'a, chrono::format::Item<'b>>;
463
/// Builds a serializer for date/time-like columns.
///
/// * `format_str` - optional strftime-style format; when `None`, values are written with
///   their `Display` implementation.
/// * `description` - human-readable name of the type, used only in error messages.
/// * `array` - the array to serialize; it is downcast to the primitive array type implied by
///   `convert`.
/// * `sample_value` - a value used to validate `format_str` up front.
/// * `convert` - turns the underlying primitive into the displayable value `T`.
/// * `format_fn` - formats a `T` with the parsed format items.
/// * `options` - serialize options; the quote style decides how the result is wrapped.
///
/// # Errors
///
/// Returns a `ComputeError` if `format_str` cannot be parsed, or if formatting `sample_value`
/// with it fails.
///
/// # Panics
///
/// Panics with `ARRAY_MISMATCH_MSG` if `array` is not of the expected primitive array type.
#[cfg(any(feature = "dtype-date", feature = "dtype-time"))]
fn date_and_time_serializer<'a, Underlying: NativeType, T: std::fmt::Display>(
    format_str: &'a Option<String>,
    description: &str,
    array: &'a dyn Array,
    sample_value: T,
    mut convert: impl FnMut(Underlying) -> T + Send + 'a,
    mut format_fn: impl for<'b> FnMut(
        &T,
        ChronoFormatIter<'b, 'a>,
    ) -> chrono::format::DelayedFormat<ChronoFormatIter<'b, 'a>>
    + Send
    + 'a,
    options: &SerializeOptions,
) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
    // Use the same panic message as every other downcast in this file.
    let array = array.as_any().downcast_ref().expect(ARRAY_MISMATCH_MSG);
    let serializer = match format_str {
        Some(format_str) => {
            // Both validation steps below report the same user-facing error.
            let invalid_format = || {
                polars_err!(ComputeError: "cannot format {description} with format '{format_str}'")
            };
            let format = chrono::format::StrftimeItems::new(format_str)
                .parse()
                .map_err(|_| invalid_format())?;
            use std::fmt::Write;
            // Fail fast on an invalid format. This returns the error to the user sooner, and
            // allows us to not return `Result` from `serialize()`.
            write!(IgnoreFmt, "{}", format_fn(&sample_value, format.iter()))
                .map_err(|_| invalid_format())?;
            let callback = move |item, buf: &mut Vec<u8>| {
                let item = convert(item);
                // We checked the format is valid above.
                let _ = write!(buf, "{}", format_fn(&item, format.iter()));
            };
            date_and_time_final_serializer(array, callback, options)
        },
        None => {
            let callback = move |item, buf: &mut Vec<u8>| {
                let item = convert(item);
                // Formatting dates into `Vec<u8>` cannot fail.
                let _ = write!(buf, "{item}");
            };
            date_and_time_final_serializer(array, callback, options)
        },
    };
    Ok(serializer)
}
509
/// Wraps a date/time formatting `callback` in the serializer matching `options.quote_style`.
///
/// * `Always` - every field, nulls included, goes through `quote_serializer`.
/// * `NonNumeric` - only non-null values are quoted.
/// * `Necessary` / `Never` - nothing is quoted.
#[cfg(any(
    feature = "dtype-date",
    feature = "dtype-time",
    feature = "dtype-datetime"
))]
fn date_and_time_final_serializer<'a, T: NativeType>(
    array: &'a PrimitiveArray<T>,
    callback: impl FnMut(T, &mut Vec<u8>) + Send + 'a,
    options: &SerializeOptions,
) -> Box<dyn Serializer<'a> + Send + 'a> {
    match options.quote_style {
        QuoteStyle::Always => {
            let unquoted = callback_serializer::<T, false>(array, callback);
            Box::new(quote_serializer(unquoted)) as Box<dyn Serializer + Send>
        },
        QuoteStyle::NonNumeric => {
            let quote_non_null = callback_serializer::<T, true>(array, callback);
            Box::new(quote_non_null)
        },
        QuoteStyle::Necessary | QuoteStyle::Never => {
            let unquoted = callback_serializer::<T, false>(array, callback);
            Box::new(unquoted)
        },
    }
}
528
/// Builds a boxed [`Serializer`] for string-like values, applying `options.quote_style`.
///
/// `f` pulls the next value out of the iterator state (`None` meaning null). `update` builds
/// the iterator state for an array; it is called once up front with `array` and again on
/// every `update_array()`.
///
/// Quoting per style, as implemented below:
///  - `Always`: every field, including nulls, is wrapped in quotes.
///  - `NonNumeric`: non-null values are wrapped in quotes; nulls are written bare.
///  - `Necessary`: empty strings are written as two quote characters; values containing the
///    separator, `\n`, `\r` or the quote character are quoted; everything else is written bare.
///  - `Never`: values are written verbatim, with no quoting or escaping.
pub(super) fn string_serializer<'a, Iter: Send + 'a>(
    mut f: impl FnMut(&mut Iter) -> Option<&str> + Send + 'a,
    options: &SerializeOptions,
    mut update: impl FnMut(&'a dyn Array) -> Iter + Send + 'a,
    array: &'a dyn Array,
) -> Box<dyn Serializer<'a> + 'a + Send> {
    const LF: u8 = b'\n';
    const CR: u8 = b'\r';

    // Glue type: holds the per-style `serialize` closure, the iterator state it reads from,
    // and the `update` closure that rebuilds that state for a new array.
    struct StringSerializer<F, Iter, Update> {
        serialize: F,
        update: Update,
        iter: Iter,
    }

    impl<'a, F, Iter, Update> Serializer<'a> for StringSerializer<F, Iter, Update>
    where
        F: FnMut(&mut Iter, &mut Vec<u8>, &SerializeOptions),
        Update: FnMut(&'a dyn Array) -> Iter,
    {
        fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
            (self.serialize)(&mut self.iter, buf, options);
        }

        fn update_array(&mut self, array: &'a dyn Array) {
            self.iter = (self.update)(array);
        }
    }

    // Writes `s` to `buf`, doubling every occurrence of `quote_char`.
    //
    // `quoted` tells whether the caller already emits the surrounding quotes. When it is
    // `false` and `s` contains at least one quote character, the surrounding quotes are added
    // here. A string without any quote character is copied unchanged.
    fn serialize_str_escaped(buf: &mut Vec<u8>, s: &[u8], quote_char: u8, quoted: bool) {
        let mut iter = memchr_iter(quote_char, s);
        let first_quote = iter.next();
        match first_quote {
            None => buf.extend_from_slice(s),
            Some(mut quote_pos) => {
                if !quoted {
                    buf.push(quote_char);
                }
                let mut start_pos = 0;
                loop {
                    // Copy the text before the current quote, then the quote itself, doubled.
                    buf.extend_from_slice(&s[start_pos..quote_pos]);
                    buf.extend_from_slice(&[quote_char, quote_char]);
                    match iter.next() {
                        Some(quote) => {
                            start_pos = quote_pos + 1;
                            quote_pos = quote;
                        },
                        None => {
                            // No more quotes: flush the remainder after the last one.
                            buf.extend_from_slice(&s[quote_pos + 1..]);
                            break;
                        },
                    }
                }
                if !quoted {
                    buf.push(quote_char);
                }
            },
        }
    }

    let iter = update(array);
    match options.quote_style {
        QuoteStyle::Always => {
            let serialize =
                move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
                    let quote_char = options.quote_char;
                    // The opening quote is written before the null check, so nulls are
                    // quoted too.
                    buf.push(quote_char);
                    let Some(s) = f(iter) else {
                        buf.extend_from_slice(options.null.as_bytes());
                        buf.push(quote_char);
                        return;
                    };
                    serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
                    buf.push(quote_char);
                };
            Box::new(StringSerializer {
                serialize,
                update,
                iter,
            })
        },
        QuoteStyle::NonNumeric => {
            let serialize =
                move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
                    // Nulls are written bare; only actual values are quoted.
                    let Some(s) = f(iter) else {
                        buf.extend_from_slice(options.null.as_bytes());
                        return;
                    };
                    let quote_char = options.quote_char;
                    buf.push(quote_char);
                    serialize_str_escaped(buf, s.as_bytes(), quote_char, true);
                    buf.push(quote_char);
                };
            Box::new(StringSerializer {
                serialize,
                update,
                iter,
            })
        },
        QuoteStyle::Necessary => {
            let serialize =
                move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
                    let Some(s) = f(iter) else {
                        buf.extend_from_slice(options.null.as_bytes());
                        return;
                    };
                    let quote_char = options.quote_char;
                    // An empty string conflicts with null, so it is necessary to quote.
                    if s.is_empty() {
                        buf.extend_from_slice(&[quote_char, quote_char]);
                        return;
                    }
                    // Separator and line breaks force quoting here; embedded quote characters
                    // are detected (and wrapped) inside `serialize_str_escaped`.
                    let needs_quote = memchr3(options.separator, LF, CR, s.as_bytes()).is_some();
                    if needs_quote {
                        buf.push(quote_char);
                    }
                    serialize_str_escaped(buf, s.as_bytes(), quote_char, needs_quote);
                    if needs_quote {
                        buf.push(quote_char);
                    }
                };
            Box::new(StringSerializer {
                serialize,
                update,
                iter,
            })
        },
        QuoteStyle::Never => {
            let serialize =
                move |iter: &mut Iter, buf: &mut Vec<u8>, options: &SerializeOptions| {
                    let Some(s) = f(iter) else {
                        buf.extend_from_slice(options.null.as_bytes());
                        return;
                    };
                    // No quoting and no escaping: the value is copied verbatim.
                    buf.extend_from_slice(s.as_bytes());
                };
            Box::new(StringSerializer {
                serialize,
                update,
                iter,
            })
        },
    }
}
673
674fn quote_serializer<'a>(serializer: impl Serializer<'a>) -> impl Serializer<'a> {
675    struct QuoteSerializer<S>(S);
676    impl<'a, S: Serializer<'a>> Serializer<'a> for QuoteSerializer<S> {
677        fn serialize(&mut self, buf: &mut Vec<u8>, options: &SerializeOptions) {
678            buf.push(options.quote_char);
679            self.0.serialize(buf, options);
680            buf.push(options.quote_char);
681        }
682
683        fn update_array(&mut self, array: &'a dyn Array) {
684            self.0.update_array(array);
685        }
686    }
687    QuoteSerializer(serializer)
688}
689
690pub(super) fn serializer_for<'a>(
691    array: &'a dyn Array,
692    options: &'a SerializeOptions,
693    dtype: &'a DataType,
694    _datetime_format: &'a str,
695    _time_zone: Option<Tz>,
696) -> PolarsResult<Box<dyn Serializer<'a> + Send + 'a>> {
697    // The needs_quotes flag captures the quote logic for the quote_wrapper! macro
698    // It is targeted at numerical types primarily; other types may required additional logic
699    let needs_quotes = match dtype {
700        DataType::Float16 | DataType::Float32 | DataType::Float64 => {
701            // When comma is used as both the field separator and decimal separator, quoting
702            // may be required. Specifically, when:
703            // - quote_style is Always, or
704            // - quote_style is Necessary or Non-Numeric, the field separator is also a comma,
705            //   and the float string field contains a comma character (no precision or precision > 0)
706            //
707            // In some rare cases, a field may get quoted when it is not strictly necessary
708            // (e.g., in scientific notation when only the first digit is non-zero such as '1e12',
709            // or null values in 'non_numeric' quote_style).
710
711            let mut should_quote = options.decimal_comma && options.separator == b',';
712            if let Some(precision) = options.float_precision {
713                should_quote &= precision > 0;
714            }
715
716            match options.quote_style {
717                QuoteStyle::Always => true,
718                QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,
719                QuoteStyle::Never => false,
720            }
721        },
722        #[cfg(feature = "dtype-decimal")]
723        DataType::Decimal(_, scale) => {
724            // Similar to logic for float data-types, but need to consider scale rather than precision
725            let should_quote = options.decimal_comma && options.separator == b',' && *scale > 0;
726
727            match options.quote_style {
728                QuoteStyle::Always => true,
729                QuoteStyle::Necessary | QuoteStyle::NonNumeric => should_quote,
730                QuoteStyle::Never => false,
731            }
732        },
733        _ => options.quote_style == QuoteStyle::Always,
734    };
735
736    macro_rules! quote_wrapper {
737        ($make_serializer:path, $($arg:tt)*) => {{
738            let serializer = $make_serializer(array.as_any().downcast_ref().unwrap(), $($arg)*);
739            if needs_quotes {
740                Box::new(quote_serializer(serializer)) as Box<dyn Serializer + Send>
741            } else {
742                Box::new(serializer)
743            }
744        }};
745        ($make_serializer:path) => { quote_wrapper!($make_serializer,) };
746    }
747
748    let serializer = match dtype {
749        DataType::Int8 => quote_wrapper!(integer_serializer::<i8>),
750        DataType::UInt8 => quote_wrapper!(integer_serializer::<u8>),
751        DataType::Int16 => quote_wrapper!(integer_serializer::<i16>),
752        DataType::UInt16 => quote_wrapper!(integer_serializer::<u16>),
753        DataType::Int32 => quote_wrapper!(integer_serializer::<i32>),
754        DataType::UInt32 => quote_wrapper!(integer_serializer::<u32>),
755        DataType::Int64 => quote_wrapper!(integer_serializer::<i64>),
756        DataType::UInt64 => quote_wrapper!(integer_serializer::<u64>),
757        DataType::Int128 => quote_wrapper!(integer_serializer::<i128>),
758        DataType::UInt128 => quote_wrapper!(integer_serializer::<u128>),
759        DataType::Float16 => {
760            match (
761                options.decimal_comma,
762                options.float_precision,
763                options.float_scientific,
764            ) {
765                // standard decimal separator (period)
766                (false, Some(precision), Some(true)) => {
767                    quote_wrapper!(
768                        float_serializer_with_precision_scientific::<pf16>,
769                        precision
770                    )
771                },
772                (false, Some(precision), _) => {
773                    quote_wrapper!(
774                        float_serializer_with_precision_positional::<pf16>,
775                        precision
776                    )
777                },
778                (false, None, Some(true)) => {
779                    quote_wrapper!(float_serializer_no_precision_scientific::<pf16>)
780                },
781                (false, None, Some(false)) => {
782                    quote_wrapper!(float_serializer_no_precision_positional::<pf16>)
783                },
784                (false, None, None) => {
785                    quote_wrapper!(float_serializer_no_precision_autoformat_f16)
786                },
787
788                // comma as the decimal separator
789                (true, Some(precision), Some(true)) => quote_wrapper!(
790                    float_serializer_with_precision_scientific_decimal_comma::<pf16>,
791                    precision
792                ),
793                (true, Some(precision), _) => quote_wrapper!(
794                    float_serializer_with_precision_positional_decimal_comma::<pf16>,
795                    precision
796                ),
797                (true, None, Some(true)) => {
798                    quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<pf16>)
799                },
800                (true, None, Some(false)) => {
801                    quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<pf16>)
802                },
803                (true, None, None) => {
804                    quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma_f16)
805                },
806            }
807        },
808        DataType::Float32 => {
809            match (
810                options.decimal_comma,
811                options.float_precision,
812                options.float_scientific,
813            ) {
814                // standard decimal separator (period)
815                (false, Some(precision), Some(true)) => {
816                    quote_wrapper!(float_serializer_with_precision_scientific::<f32>, precision)
817                },
818                (false, Some(precision), _) => {
819                    quote_wrapper!(float_serializer_with_precision_positional::<f32>, precision)
820                },
821                (false, None, Some(true)) => {
822                    quote_wrapper!(float_serializer_no_precision_scientific::<f32>)
823                },
824                (false, None, Some(false)) => {
825                    quote_wrapper!(float_serializer_no_precision_positional::<f32>)
826                },
827                (false, None, None) => {
828                    quote_wrapper!(float_serializer_no_precision_autoformat::<f32>)
829                },
830
831                // comma as the decimal separator
832                (true, Some(precision), Some(true)) => quote_wrapper!(
833                    float_serializer_with_precision_scientific_decimal_comma::<f32>,
834                    precision
835                ),
836                (true, Some(precision), _) => quote_wrapper!(
837                    float_serializer_with_precision_positional_decimal_comma::<f32>,
838                    precision
839                ),
840                (true, None, Some(true)) => {
841                    quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f32>)
842                },
843                (true, None, Some(false)) => {
844                    quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f32>)
845                },
846                (true, None, None) => {
847                    quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f32>)
848                },
849            }
850        },
851        DataType::Float64 => {
852            match (
853                options.decimal_comma,
854                options.float_precision,
855                options.float_scientific,
856            ) {
857                // standard decimal separator (period)
858                (false, Some(precision), Some(true)) => {
859                    quote_wrapper!(float_serializer_with_precision_scientific::<f64>, precision)
860                },
861                (false, Some(precision), _) => {
862                    quote_wrapper!(float_serializer_with_precision_positional::<f64>, precision)
863                },
864                (false, None, Some(true)) => {
865                    quote_wrapper!(float_serializer_no_precision_scientific::<f64>)
866                },
867                (false, None, Some(false)) => {
868                    quote_wrapper!(float_serializer_no_precision_positional::<f64>)
869                },
870                (false, None, None) => {
871                    quote_wrapper!(float_serializer_no_precision_autoformat::<f64>)
872                },
873
874                // comma as the decimal separator
875                (true, Some(precision), Some(true)) => quote_wrapper!(
876                    float_serializer_with_precision_scientific_decimal_comma::<f64>,
877                    precision
878                ),
879                (true, Some(precision), _) => quote_wrapper!(
880                    float_serializer_with_precision_positional_decimal_comma::<f64>,
881                    precision
882                ),
883                (true, None, Some(true)) => {
884                    quote_wrapper!(float_serializer_no_precision_scientific_decimal_comma::<f64>)
885                },
886                (true, None, Some(false)) => {
887                    quote_wrapper!(float_serializer_no_precision_positional_decimal_comma::<f64>)
888                },
889                (true, None, None) => {
890                    quote_wrapper!(float_serializer_no_precision_autoformat_decimal_comma::<f64>)
891                },
892            }
893        },
894        DataType::Null => quote_wrapper!(null_serializer),
895        DataType::Boolean => {
896            let array = array.as_any().downcast_ref().unwrap();
897            match options.quote_style {
898                QuoteStyle::Always => Box::new(quote_serializer(bool_serializer::<false>(array)))
899                    as Box<dyn Serializer + Send>,
900                QuoteStyle::NonNumeric => Box::new(bool_serializer::<true>(array)),
901                _ => Box::new(bool_serializer::<false>(array)),
902            }
903        },
904        #[cfg(feature = "dtype-date")]
905        DataType::Date => date_and_time_serializer(
906            &options.date_format,
907            "NaiveDate",
908            array,
909            chrono::NaiveDate::MAX,
910            arrow::temporal_conversions::date32_to_date,
911            |date, items| date.format_with_items(items),
912            options,
913        )?,
914        #[cfg(feature = "dtype-time")]
915        DataType::Time => date_and_time_serializer(
916            &options.time_format,
917            "NaiveTime",
918            array,
919            chrono::NaiveTime::MIN,
920            arrow::temporal_conversions::time64ns_to_time,
921            |time, items| time.format_with_items(items),
922            options,
923        )?,
924        #[cfg(feature = "dtype-datetime")]
925        DataType::Datetime(time_unit, _) => {
926            let format = chrono::format::StrftimeItems::new(_datetime_format)
927                .parse()
928                .map_err(|_| {
929                    polars_err!(
930                        ComputeError: "cannot format {} with format '{_datetime_format}'",
931                        if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
932                    )
933                })?;
934            use std::fmt::Write;
935            let sample_datetime = match _time_zone {
936                #[cfg(feature = "timezones")]
937                Some(time_zone) => time_zone
938                    .from_utc_datetime(&chrono::NaiveDateTime::MAX)
939                    .format_with_items(format.iter()),
940                #[cfg(not(feature = "timezones"))]
941                Some(_) => panic!("activate 'timezones' feature"),
942                None => chrono::NaiveDateTime::MAX.format_with_items(format.iter()),
943            };
            // Fail fast on an invalid format. This returns the error to the user sooner, and lets us
            // avoid returning a `Result` from `serialize()`.
946            write!(IgnoreFmt, "{sample_datetime}").map_err(|_| {
947                polars_err!(
948                    ComputeError: "cannot format {} with format '{_datetime_format}'",
949                    if _time_zone.is_some() { "DateTime" } else { "NaiveDateTime" },
950                )
951            })?;
952
953            let array = array.as_any().downcast_ref().unwrap();
954
955            macro_rules! time_unit_serializer {
956                ($convert:ident) => {
957                    match _time_zone {
958                        #[cfg(feature = "timezones")]
959                        Some(time_zone) => {
960                            let callback = move |item, buf: &mut Vec<u8>| {
961                                let item = arrow::temporal_conversions::$convert(item);
962                                let item = time_zone.from_utc_datetime(&item);
963                                // We checked the format is valid above.
964                                let _ = write!(buf, "{}", item.format_with_items(format.iter()));
965                            };
966                            date_and_time_final_serializer(array, callback, options)
967                        },
968                        #[cfg(not(feature = "timezones"))]
969                        Some(_) => panic!("activate 'timezones' feature"),
970                        None => {
971                            let callback = move |item, buf: &mut Vec<u8>| {
972                                let item = arrow::temporal_conversions::$convert(item);
973                                // We checked the format is valid above.
974                                let _ = write!(buf, "{}", item.format_with_items(format.iter()));
975                            };
976                            date_and_time_final_serializer(array, callback, options)
977                        },
978                    }
979                };
980            }
981
982            match time_unit {
983                TimeUnit::Nanoseconds => time_unit_serializer!(timestamp_ns_to_datetime),
984                TimeUnit::Microseconds => time_unit_serializer!(timestamp_us_to_datetime),
985                TimeUnit::Milliseconds => time_unit_serializer!(timestamp_ms_to_datetime),
986            }
987        },
988        DataType::String => string_serializer(
989            |iter| Iterator::next(iter).expect(TOO_MANY_MSG),
990            options,
991            |arr| {
992                arr.as_any()
993                    .downcast_ref::<Utf8ViewArray>()
994                    .expect(ARRAY_MISMATCH_MSG)
995                    .iter()
996            },
997            array,
998        ),
999        #[cfg(feature = "dtype-categorical")]
1000        DataType::Categorical(_, mapping) | DataType::Enum(_, mapping) => {
1001            polars_core::with_match_categorical_physical_type!(dtype.cat_physical().unwrap(), |$C| {
1002                string_serializer(
1003                    |iter| {
1004                        let &idx: &<$C as PolarsCategoricalType>::Native = Iterator::next(iter).expect(TOO_MANY_MSG)?;
1005                        Some(unsafe { mapping.cat_to_str_unchecked(idx.as_cat()) })
1006                    },
1007                    options,
1008                    |arr| {
1009                        arr.as_any()
1010                            .downcast_ref::<PrimitiveArray<<$C as PolarsCategoricalType>::Native>>()
1011                            .expect(ARRAY_MISMATCH_MSG)
1012                            .iter()
1013                    },
1014                    array,
1015                )
1016            })
1017        },
1018        #[cfg(feature = "dtype-decimal")]
1019        DataType::Decimal(_, scale) => {
1020            quote_wrapper!(decimal_serializer, *scale)
1021        },
1022        _ => {
1023            polars_bail!(ComputeError: "datatype {dtype} cannot be written to CSV\n\nConsider using JSON or a binary format.")
1024        },
1025    };
1026    Ok(serializer)
1027}
1028
#[cfg(test)]
mod test {
    use arrow::array::NullArray;
    use polars_core::prelude::ArrowDataType;

    use super::string_serializer;
    use crate::csv::write::options::{QuoteStyle, SerializeOptions};

    // The string serializer has the most intricate escaping rules of all the serializers, so every
    // quote style is checked against nulls, empty strings, embedded quotes, separators and line breaks.
    #[test]
    fn test_string_serializer() {
        // Default serialization options with only the quote style overridden.
        fn options_for(quote_style: QuoteStyle) -> SerializeOptions {
            SerializeOptions {
                quote_style,
                ..SerializeOptions::default()
            }
        }

        // Serializes the single (possibly null) value `s` with `options` and panics, reporting the
        // caller's location, unless the produced bytes are exactly `expected`.
        #[track_caller]
        fn expect_csv(options: &SerializeOptions, s: Option<&str>, expected: &str) {
            // Stand-in array: the closure below ignores it and hands out `s` directly.
            let placeholder = NullArray::new(ArrowDataType::Null, 0);
            let mut out = Vec::new();
            string_serializer(|value| *value, options, |_| s, &placeholder)
                .serialize(&mut out, options);
            let serialized = std::str::from_utf8(&out).unwrap();
            // Plain `assert!` with a custom message rather than `assert_eq!()`: the latter prints the
            // debug format, which is hard to read with all the escapes.
            assert!(
                serialized == expected,
                "CSV string {s:?} wasn't serialized correctly: expected: `{expected}`, got: `{serialized}`"
            );
        }

        // `Always`: every value is quoted, nulls included.
        let always = options_for(QuoteStyle::Always);
        expect_csv(&always, None, r#""""#);
        expect_csv(&always, Some(""), r#""""#);
        expect_csv(&always, Some("a"), r#""a""#);
        expect_csv(&always, Some("\""), r#""""""#);
        expect_csv(&always, Some("a\"\"b"), r#""a""""b""#);

        // `Necessary`: quotes appear only where the content requires them; the empty string is
        // still quoted while a null produces no output.
        let necessary = options_for(QuoteStyle::Necessary);
        expect_csv(&necessary, None, "");
        expect_csv(&necessary, Some(""), r#""""#);
        expect_csv(&necessary, Some("a"), "a");
        expect_csv(&necessary, Some("\""), r#""""""#);
        expect_csv(&necessary, Some("a\"\"b"), r#""a""""b""#);
        expect_csv(&necessary, Some("a b"), "a b");
        expect_csv(&necessary, Some("a,b"), r#""a,b""#);
        expect_csv(&necessary, Some("a\nb"), "\"a\nb\"");
        expect_csv(&necessary, Some("a\rb"), "\"a\rb\"");

        // `Never`: the value is emitted verbatim, with no quoting or escaping at all.
        let never = options_for(QuoteStyle::Never);
        expect_csv(&never, None, "");
        expect_csv(&never, Some(""), "");
        expect_csv(&never, Some("a"), "a");
        expect_csv(&never, Some("\""), "\"");
        expect_csv(&never, Some("a\"\"b"), "a\"\"b");
        expect_csv(&never, Some("a b"), "a b");
        expect_csv(&never, Some("a,b"), "a,b");
        expect_csv(&never, Some("a\nb"), "a\nb");
        expect_csv(&never, Some("a\rb"), "a\rb");

        // `NonNumeric`: every non-null string is quoted, nulls stay bare.
        let non_numeric = options_for(QuoteStyle::NonNumeric);
        expect_csv(&non_numeric, None, "");
        expect_csv(&non_numeric, Some(""), r#""""#);
        expect_csv(&non_numeric, Some("a"), r#""a""#);
        expect_csv(&non_numeric, Some("\""), r#""""""#);
        expect_csv(&non_numeric, Some("a\"\"b"), r#""a""""b""#);
        expect_csv(&non_numeric, Some("a b"), r#""a b""#);
        expect_csv(&non_numeric, Some("a,b"), r#""a,b""#);
        expect_csv(&non_numeric, Some("a\nb"), "\"a\nb\"");
        expect_csv(&non_numeric, Some("a\rb"), "\"a\rb\"");
    }
}