polars_core/frame/row/
transpose.rs

1use std::borrow::Cow;
2
3use either::Either;
4
5use super::*;
6
7impl DataFrame {
8    pub(crate) fn transpose_from_dtype(
9        &self,
10        dtype: &DataType,
11        keep_names_as: Option<PlSmallStr>,
12        names_out: &[PlSmallStr],
13    ) -> PolarsResult<DataFrame> {
14        let new_width = self.height();
15        let new_height = self.width();
16        // Allocate space for the transposed columns, putting the "row names" first if needed
17        let mut cols_t = match keep_names_as {
18            None => Vec::<Column>::with_capacity(new_width),
19            Some(name) => {
20                let mut tmp = Vec::<Column>::with_capacity(new_width + 1);
21                tmp.push(
22                    StringChunked::from_iter_values(
23                        name,
24                        self.get_column_names_owned().into_iter(),
25                    )
26                    .into_column(),
27                );
28                tmp
29            },
30        };
31
32        let cols = &self.columns;
33        match dtype {
34            #[cfg(feature = "dtype-i8")]
35            DataType::Int8 => numeric_transpose::<Int8Type>(cols, names_out, &mut cols_t),
36            #[cfg(feature = "dtype-i16")]
37            DataType::Int16 => numeric_transpose::<Int16Type>(cols, names_out, &mut cols_t),
38            DataType::Int32 => numeric_transpose::<Int32Type>(cols, names_out, &mut cols_t),
39            DataType::Int64 => numeric_transpose::<Int64Type>(cols, names_out, &mut cols_t),
40            #[cfg(feature = "dtype-u8")]
41            DataType::UInt8 => numeric_transpose::<UInt8Type>(cols, names_out, &mut cols_t),
42            #[cfg(feature = "dtype-u16")]
43            DataType::UInt16 => numeric_transpose::<UInt16Type>(cols, names_out, &mut cols_t),
44            DataType::UInt32 => numeric_transpose::<UInt32Type>(cols, names_out, &mut cols_t),
45            DataType::UInt64 => numeric_transpose::<UInt64Type>(cols, names_out, &mut cols_t),
46            DataType::Float32 => numeric_transpose::<Float32Type>(cols, names_out, &mut cols_t),
47            DataType::Float64 => numeric_transpose::<Float64Type>(cols, names_out, &mut cols_t),
48            #[cfg(feature = "object")]
49            DataType::Object(_) => {
50                // this requires to support `Object` in Series::iter which we don't yet
51                polars_bail!(InvalidOperation: "Object dtype not supported in 'transpose'")
52            },
53            _ => {
54                let phys_dtype = dtype.to_physical();
55                let mut buffers = (0..new_width)
56                    .map(|_| {
57                        let buf: AnyValueBufferTrusted = (&phys_dtype, new_height).into();
58                        buf
59                    })
60                    .collect::<Vec<_>>();
61
62                let columns = self
63                    .materialized_column_iter()
64                    // first cast to supertype before casting to physical to ensure units are correct
65                    .map(|s| s.cast(dtype).unwrap().cast(&phys_dtype).unwrap())
66                    .collect::<Vec<_>>();
67
68                // this is very expensive. A lot of cache misses here.
69                // This is the part that is performance critical.
70                for s in columns {
71                    polars_ensure!(s.dtype() == &phys_dtype, ComputeError: "cannot transpose with supertype: {}", dtype);
72                    s.iter().zip(buffers.iter_mut()).for_each(|(av, buf)| {
73                        // SAFETY: we checked the type and we borrow
74                        unsafe {
75                            buf.add_unchecked_borrowed_physical(&av);
76                        }
77                    });
78                }
79                cols_t.extend(buffers.into_iter().zip(names_out).map(|(buf, name)| {
80                    // SAFETY: we are casting back to the supertype
81                    let mut s = unsafe { buf.into_series().cast_unchecked(dtype).unwrap() };
82                    s.rename(name.clone());
83                    s.into()
84                }));
85            },
86        };
87        Ok(unsafe { DataFrame::new_no_checks(new_height, cols_t) })
88    }
89
90    pub fn transpose(
91        &mut self,
92        keep_names_as: Option<&str>,
93        new_col_names: Option<Either<String, Vec<String>>>,
94    ) -> PolarsResult<DataFrame> {
95        let new_col_names = match new_col_names {
96            None => None,
97            Some(Either::Left(v)) => Some(Either::Left(v.into())),
98            Some(Either::Right(v)) => Some(Either::Right(
99                v.into_iter().map(Into::into).collect::<Vec<_>>(),
100            )),
101        };
102
103        self.transpose_impl(keep_names_as, new_col_names)
104    }
105    /// Transpose a DataFrame. This is a very expensive operation.
106    pub fn transpose_impl(
107        &mut self,
108        keep_names_as: Option<&str>,
109        new_col_names: Option<Either<PlSmallStr, Vec<PlSmallStr>>>,
110    ) -> PolarsResult<DataFrame> {
111        // We must iterate columns as [`AnyValue`], so we must be contiguous.
112        self.as_single_chunk_par();
113
114        let mut df = Cow::Borrowed(self); // Can't use self because we might drop a name column
115        let names_out = match new_col_names {
116            None => (0..self.height())
117                .map(|i| format_pl_smallstr!("column_{i}"))
118                .collect(),
119            Some(cn) => match cn {
120                Either::Left(name) => {
121                    let new_names = self.column(name.as_str()).and_then(|x| x.str())?;
122                    polars_ensure!(new_names.null_count() == 0, ComputeError: "Column with new names can't have null values");
123                    df = Cow::Owned(self.drop(name.as_str())?);
124                    new_names
125                        .into_no_null_iter()
126                        .map(PlSmallStr::from_str)
127                        .collect()
128                },
129                Either::Right(names) => {
130                    polars_ensure!(names.len() == self.height(), ShapeMismatch: "Length of new column names must be the same as the row count");
131                    names
132                },
133            },
134        };
135        if let Some(cn) = keep_names_as {
136            // Check that the column name we're using for the original column names is unique before
137            // wasting time transposing
138            polars_ensure!(names_out.iter().all(|a| a.as_str() != cn), Duplicate: "{} is already in output column names", cn)
139        }
140        polars_ensure!(
141            df.height() != 0 && df.width() != 0,
142            NoData: "unable to transpose an empty DataFrame"
143        );
144        let dtype = df.get_supertype().unwrap()?;
145        match dtype {
146            #[cfg(feature = "dtype-categorical")]
147            DataType::Categorical(_, _) | DataType::Enum(_, _) => {
148                let mut valid = true;
149                let mut rev_map: Option<&Arc<RevMapping>> = None;
150                for s in self.columns.iter() {
151                    if let DataType::Categorical(Some(col_rev_map), _)
152                    | DataType::Enum(Some(col_rev_map), _) = &s.dtype()
153                    {
154                        match rev_map {
155                            Some(rev_map) => valid = valid && rev_map.same_src(col_rev_map),
156                            None => {
157                                rev_map = Some(col_rev_map);
158                            },
159                        }
160                    }
161                }
162                polars_ensure!(valid, string_cache_mismatch);
163            },
164            _ => {},
165        }
166        df.transpose_from_dtype(&dtype, keep_names_as.map(PlSmallStr::from_str), &names_out)
167    }
168}
169
170#[inline]
171unsafe fn add_value<T: NumericNative>(
172    values_buf_ptr: usize,
173    col_idx: usize,
174    row_idx: usize,
175    value: T,
176) {
177    let column = (*(values_buf_ptr as *mut Vec<Vec<T>>)).get_unchecked_mut(col_idx);
178    let el_ptr = column.as_mut_ptr();
179    *el_ptr.add(row_idx) = value;
180}
181
182// This just fills a pre-allocated mutable series vector, which may have a name column.
183// Nothing is returned and the actual DataFrame is constructed above.
184pub(super) fn numeric_transpose<T>(
185    cols: &[Column],
186    names_out: &[PlSmallStr],
187    cols_t: &mut Vec<Column>,
188) where
189    T: PolarsNumericType,
190    //S: AsRef<str>,
191    ChunkedArray<T>: IntoSeries,
192{
193    let new_width = cols[0].len();
194    let new_height = cols.len();
195
196    let has_nulls = cols.iter().any(|s| s.null_count() > 0);
197
198    let mut values_buf: Vec<Vec<T::Native>> = (0..new_width)
199        .map(|_| Vec::with_capacity(new_height))
200        .collect();
201    let mut validity_buf: Vec<_> = if has_nulls {
202        // we first use bools instead of bits, because we can access these in parallel without aliasing
203        (0..new_width).map(|_| vec![true; new_height]).collect()
204    } else {
205        (0..new_width).map(|_| vec![]).collect()
206    };
207
208    // work with *mut pointers because we it is UB write to &refs.
209    let values_buf_ptr = &mut values_buf as *mut Vec<Vec<T::Native>> as usize;
210    let validity_buf_ptr = &mut validity_buf as *mut Vec<Vec<bool>> as usize;
211
212    POOL.install(|| {
213        cols.iter()
214            .map(Column::as_materialized_series)
215            .enumerate()
216            .for_each(|(row_idx, s)| {
217                let s = s.cast(&T::get_dtype()).unwrap();
218                let ca = s.unpack::<T>().unwrap();
219
220                // SAFETY:
221                // we access in parallel, but every access is unique, so we don't break aliasing rules
222                // we also ensured we allocated enough memory, so we never reallocate and thus
223                // the pointers remain valid.
224                if has_nulls {
225                    for (col_idx, opt_v) in ca.iter().enumerate() {
226                        match opt_v {
227                            None => unsafe {
228                                let column = (*(validity_buf_ptr as *mut Vec<Vec<bool>>))
229                                    .get_unchecked_mut(col_idx);
230                                let el_ptr = column.as_mut_ptr();
231                                *el_ptr.add(row_idx) = false;
232                                // we must initialize this memory otherwise downstream code
233                                // might access uninitialized memory when the masked out values
234                                // are changed.
235                                add_value(values_buf_ptr, col_idx, row_idx, T::Native::default());
236                            },
237                            Some(v) => unsafe {
238                                add_value(values_buf_ptr, col_idx, row_idx, v);
239                            },
240                        }
241                    }
242                } else {
243                    for (col_idx, v) in ca.into_no_null_iter().enumerate() {
244                        unsafe {
245                            let column = (*(values_buf_ptr as *mut Vec<Vec<T::Native>>))
246                                .get_unchecked_mut(col_idx);
247                            let el_ptr = column.as_mut_ptr();
248                            *el_ptr.add(row_idx) = v;
249                        }
250                    }
251                }
252            })
253    });
254
255    let par_iter = values_buf
256        .into_par_iter()
257        .zip(validity_buf)
258        .zip(names_out)
259        .map(|((mut values, validity), name)| {
260            // SAFETY:
261            // all values are written we can now set len
262            unsafe {
263                values.set_len(new_height);
264            }
265
266            let validity = if has_nulls {
267                let validity = Bitmap::from_trusted_len_iter(validity.iter().copied());
268                if validity.unset_bits() > 0 {
269                    Some(validity)
270                } else {
271                    None
272                }
273            } else {
274                None
275            };
276
277            let arr = PrimitiveArray::<T::Native>::new(
278                T::get_dtype().to_arrow(CompatLevel::newest()),
279                values.into(),
280                validity,
281            );
282            ChunkedArray::with_chunk(name.clone(), arr).into_column()
283        });
284    POOL.install(|| cols_t.par_extend(par_iter));
285}
286
#[cfg(test)]
mod test {
    use super::*;

    /// Transposes `input` with default arguments and checks the result against `want`,
    /// treating missing values as equal.
    fn assert_transposed(mut input: DataFrame, want: DataFrame) -> PolarsResult<()> {
        let got = input.transpose(None, None)?;
        assert!(got.equals_missing(&want));
        Ok(())
    }

    #[test]
    fn test_transpose() -> PolarsResult<()> {
        // Integers without missing values.
        assert_transposed(
            df![
                "a" => [1, 2, 3],
                "b" => [10, 20, 30],
            ]?,
            df![
                "column_0" => [1, 10],
                "column_1" => [2, 20],
                "column_2" => [3, 30],
            ]?,
        )?;

        // Integers with missing values in both columns.
        assert_transposed(
            df![
                "a" => [Some(1), None, Some(3)],
                "b" => [Some(10), Some(20), None],
            ]?,
            df![
                "column_0" => [1, 10],
                "column_1" => [None, Some(20)],
                "column_2" => [Some(3), None],
            ]?,
        )?;

        // Strings mixed with integers: everything ends up as strings.
        assert_transposed(
            df![
                "a" => ["a", "b", "c"],
                "b" => [Some(10), Some(20), None],
            ]?,
            df![
                "column_0" => ["a", "10"],
                "column_1" => ["b", "20"],
                "column_2" => [Some("c"), None],
            ]?,
        )?;

        Ok(())
    }
}
polars_core/frame/row/transpose.rs

polars_core/frame/row/
transpose.rs