Skip to content

Commit 4349d1d

Browse files
authored Mar 4, 2025
Store categories from pandas. (#11303)
- Change host columnar adapter to receive categories.
- Store categories in the meta info.
- Glue code for the Python interface.
1 parent 25a4997 commit 4349d1d

20 files changed: +788 -106 lines changed
 

‎R-package/src/Makevars.in

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ OBJECTS= \
6666
$(PKGROOT)/src/gbm/gblinear_model.o \
6767
$(PKGROOT)/src/data/adapter.o \
6868
$(PKGROOT)/src/data/array_interface.o \
69+
$(PKGROOT)/src/data/cat_container.o \
6970
$(PKGROOT)/src/data/simple_dmatrix.o \
7071
$(PKGROOT)/src/data/data.o \
7172
$(PKGROOT)/src/data/sparse_page_raw_format.o \

‎R-package/src/Makevars.win.in

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ OBJECTS= \
6565
$(PKGROOT)/src/gbm/gblinear_model.o \
6666
$(PKGROOT)/src/data/adapter.o \
6767
$(PKGROOT)/src/data/array_interface.o \
68+
$(PKGROOT)/src/data/cat_container.o \
6869
$(PKGROOT)/src/data/simple_dmatrix.o \
6970
$(PKGROOT)/src/data/data.o \
7071
$(PKGROOT)/src/data/sparse_page_raw_format.o \

‎include/xgboost/data.h

+32-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2015-2024, XGBoost Contributors
2+
* Copyright 2015-2025, XGBoost Contributors
33
* \file data.h
44
* \brief The input data structure of xgboost.
55
* \author Tianqi Chen
@@ -8,8 +8,8 @@
88
#define XGBOOST_DATA_H_
99

1010
#include <dmlc/base.h>
11-
#include <dmlc/data.h>
12-
#include <dmlc/serializer.h>
11+
#include <dmlc/io.h> // for Stream
12+
#include <dmlc/serializer.h> // for Handler
1313
#include <xgboost/base.h>
1414
#include <xgboost/host_device_vector.h>
1515
#include <xgboost/linalg.h>
@@ -42,13 +42,16 @@ enum class FeatureType : uint8_t { kNumerical = 0, kCategorical = 1 };
4242

4343
enum class DataSplitMode : int { kRow = 0, kCol = 1 };
4444

45-
/*!
46-
* \brief Meta information about dataset, always sit in memory.
45+
// Forward declaration of the container used by the meta info.
46+
struct CatContainer;
47+
48+
/**
49+
* @brief Meta information about dataset, always sit in memory.
4750
*/
4851
class MetaInfo {
4952
public:
5053
/*! \brief number of data fields in MetaInfo */
51-
static constexpr uint64_t kNumField = 12;
54+
static constexpr uint64_t kNumField = 13;
5255

5356
/*! \brief number of rows in the data */
5457
bst_idx_t num_row_{0}; // NOLINT
@@ -100,9 +103,9 @@ class MetaInfo {
100103
*/
101104
HostDeviceVector<float> feature_weights;
102105

103-
/*! \brief default constructor */
104-
MetaInfo() = default;
106+
MetaInfo();
105107
MetaInfo(MetaInfo&& that) = default;
108+
MetaInfo(MetaInfo const& that) = delete;
106109
MetaInfo& operator=(MetaInfo&& that) = default;
107110
MetaInfo& operator=(MetaInfo const& that) = delete;
108111

@@ -205,6 +208,16 @@ class MetaInfo {
205208
* @brief Flag for whether the DMatrix has categorical features.
206209
*/
207210
bool HasCategorical() const { return has_categorical_; }
211+
/**
212+
* @brief Getters for categories.
213+
*/
214+
[[nodiscard]] CatContainer const* Cats() const;
215+
[[nodiscard]] CatContainer* Cats();
216+
[[nodiscard]] std::shared_ptr<CatContainer const> CatsShared() const;
217+
/**
218+
* @brief Setter for categories.
219+
*/
220+
void Cats(std::shared_ptr<CatContainer> cats);
208221

209222
private:
210223
void SetInfoFromHost(Context const* ctx, StringView key, Json arr);
@@ -213,6 +226,8 @@ class MetaInfo {
213226
/*! \brief argsort of labels */
214227
mutable std::vector<size_t> label_order_cache_;
215228
bool has_categorical_{false};
229+
230+
std::shared_ptr<CatContainer> cats_;
216231
};
217232

218233
/*! \brief Element from a sparse vector */
@@ -691,7 +706,15 @@ class DMatrix {
691706
* @param slice_id Index of the current slice
692707
* @return DMatrix containing the slice of columns
693708
*/
694-
virtual DMatrix *SliceCol(int num_slices, int slice_id) = 0;
709+
virtual DMatrix* SliceCol(int num_slices, int slice_id) = 0;
710+
/**
711+
* @brief Accessor for the string representation of the categories.
712+
*/
713+
CatContainer const* Cats() const { return this->CatsShared().get(); }
714+
[[nodiscard]] virtual std::shared_ptr<CatContainer const> CatsShared() const {
715+
LOG(FATAL) << "Not implemented for the current DMatrix type.";
716+
return nullptr;
717+
}
695718

696719
protected:
697720
virtual BatchSet<SparsePage> GetRowBatches() = 0;

‎include/xgboost/predictor.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
* performs predictions for a gradient booster.
66
*/
77
#pragma once
8-
#include <xgboost/base.h>
8+
#include <dmlc/registry.h> // for FunctionRegEntryBase
9+
#include <xgboost/base.h> // for bst_tree_t
910
#include <xgboost/cache.h> // for DMatrixCache
1011
#include <xgboost/context.h> // for Context
1112
#include <xgboost/context.h>

‎ops/script/lint_python.py

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class LintersPaths:
2727
"tests/python/test_early_stopping.py",
2828
"tests/python/test_multi_target.py",
2929
"tests/python/test_objectives.py",
30+
"tests/python/test_ordinal.py",
3031
"tests/python/test_predict.py",
3132
"tests/python/test_quantile_dmatrix.py",
3233
"tests/python/test_tracker.py",
@@ -101,6 +102,7 @@ class LintersPaths:
101102
"tests/python-gpu/load_pickle.py",
102103
"tests/python-gpu/test_gpu_training_continuation.py",
103104
"tests/python/test_model_io.py",
105+
"tests/python/test_ordinal.py",
104106
"tests/test_distributed/test_federated/",
105107
"tests/test_distributed/test_gpu_federated/",
106108
"tests/test_distributed/test_with_dask/test_ranking.py",

‎python-package/xgboost/_data_utils.py

+186-16
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,35 @@
22

33
import copy
44
import ctypes
5+
import functools
56
import json
6-
from typing import Literal, Optional, Protocol, Tuple, Type, TypedDict, Union, cast
7+
from typing import (
8+
TYPE_CHECKING,
9+
Any,
10+
Dict,
11+
Literal,
12+
Optional,
13+
Protocol,
14+
Tuple,
15+
Type,
16+
TypedDict,
17+
TypeGuard,
18+
Union,
19+
cast,
20+
overload,
21+
)
722

823
import numpy as np
924

10-
from ._typing import CNumericPtr, DataType, NumpyOrCupy
11-
from .compat import import_cupy
25+
from ._typing import CNumericPtr, DataType, NumpyDType, NumpyOrCupy
26+
from .compat import import_cupy, lazy_isinstance
27+
28+
if TYPE_CHECKING:
29+
import pandas as pd
30+
import pyarrow as pa
1231

1332

33+
# Used for accepting inputs for numpy and cupy arrays
1434
class _ArrayLikeArg(Protocol):
1535
@property
1636
def __array_interface__(self) -> "ArrayInf": ...
@@ -44,19 +64,27 @@ def shape(self) -> Tuple[int, int]:
4464
},
4565
)
4666

67+
StringArray = TypedDict("StringArray", {"offsets": ArrayInf, "values": ArrayInf})
68+
4769

4870
def array_hasobject(data: DataType) -> bool:
4971
"""Whether the numpy array has object dtype."""
5072
return hasattr(data.dtype, "hasobject") and data.dtype.hasobject
5173

5274

53-
def cuda_array_interface(data: DataType) -> bytes:
54-
"""Make cuda array interface str."""
75+
def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> ArrayInf:
76+
"""Returns a dictionary storing the CUDA array interface."""
5577
if array_hasobject(data):
5678
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
57-
interface = data.__cuda_array_interface__
58-
if "mask" in interface:
59-
interface["mask"] = interface["mask"].__cuda_array_interface__
79+
ainf = data.__cuda_array_interface__
80+
if "mask" in ainf:
81+
ainf["mask"] = ainf["mask"].__cuda_array_interface__ # type: ignore
82+
return cast(ArrayInf, ainf)
83+
84+
85+
def cuda_array_interface(data: _CudaArrayLikeArg) -> bytes:
86+
"""Make cuda array interface str."""
87+
interface = cuda_array_interface_dict(data)
6088
interface_str = bytes(json.dumps(interface), "utf-8")
6189
return interface_str
6290

@@ -107,6 +135,12 @@ def __cuda_array_interface__(self, interface: ArrayInf) -> None:
107135
return out
108136

109137

138+
# Default constant value for CUDA per-thread stream.
139+
STREAM_PER_THREAD = 2
140+
141+
142+
# Typing is not strict as there are subtle differences between CUDA array interface and
143+
# array interface. We handle them uniformly for now.
110144
def make_array_interface(
111145
ptr: Union[CNumericPtr, int],
112146
shape: Tuple[int, ...],
@@ -134,21 +168,157 @@ def make_array_interface(
134168
return array
135169

136170
array["data"] = (addr, True)
137-
if is_cuda:
138-
array["stream"] = 2
171+
if is_cuda and "stream" not in array:
172+
array["stream"] = STREAM_PER_THREAD
139173
array["shape"] = shape
140174
array["strides"] = None
141175
return array
142176

143177

144-
def array_interface_dict(data: np.ndarray) -> ArrayInf:
145-
"""Convert array interface into a Python dictionary."""
178+
def is_arrow_dict(data: Any) -> TypeGuard["pa.DictionaryArray"]:
179+
"""Is this an arrow dictionary array?"""
180+
return lazy_isinstance(data, "pyarrow.lib", "DictionaryArray")
181+
182+
183+
class PdCatAccessor(Protocol):
184+
"""Protocol for pandas cat accessor."""
185+
186+
@property
187+
def categories( # pylint: disable=missing-function-docstring
188+
self,
189+
) -> "pd.Index": ...
190+
191+
@property
192+
def codes(self) -> "pd.Series": ... # pylint: disable=missing-function-docstring
193+
194+
@property
195+
def dtype(self) -> np.dtype: ... # pylint: disable=missing-function-docstring
196+
197+
def to_arrow( # pylint: disable=missing-function-docstring
198+
self,
199+
) -> Union["pa.StringArray", "pa.IntegerArray"]: ...
200+
201+
@property
202+
def __cuda_array_interface__(self) -> ArrayInf: ...
203+
204+
205+
def _is_pd_cat(data: Any) -> TypeGuard[PdCatAccessor]:
206+
# Test pd.Series.cat, not pd.Series
207+
return hasattr(data, "categories") and hasattr(data, "codes")
208+
209+
210+
@functools.cache
211+
def _arrow_typestr() -> Dict["pa.DataType", str]:
212+
import pyarrow as pa
213+
214+
mapping = {
215+
pa.int8(): "<i1",
216+
pa.int16(): "<i2",
217+
pa.int32(): "<i4",
218+
pa.int64(): "<i8",
219+
pa.uint8(): "<u1",
220+
pa.uint16(): "<u2",
221+
pa.uint32(): "<u4",
222+
pa.uint64(): "<u8",
223+
}
224+
225+
return mapping
226+
227+
228+
def npstr_to_arrow_strarr(strarr: np.ndarray) -> Tuple[np.ndarray, str]:
229+
"""Convert a numpy string array to an arrow string array."""
230+
lenarr = np.vectorize(len)
231+
offsets = np.cumsum(np.concatenate([np.array([0], dtype=np.int64), lenarr(strarr)]))
232+
values = strarr.sum()
233+
assert "\0" not in values # arrow string array doesn't need null terminal
234+
return offsets.astype(np.int32), values
235+
236+
237+
def _ensure_np_dtype(
238+
data: DataType, dtype: Optional[NumpyDType]
239+
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
240+
"""Ensure the np array has correct type and is contiguous."""
241+
if array_hasobject(data) or data.dtype in [np.float16, np.bool_]:
242+
dtype = np.float32
243+
data = data.astype(dtype, copy=False)
244+
if not data.flags.aligned:
245+
data = np.require(data, requirements="A")
246+
return data, dtype
247+
248+
249+
@overload
250+
def array_interface_dict(data: np.ndarray) -> ArrayInf: ...
251+
252+
253+
@overload
254+
def array_interface_dict(
255+
data: PdCatAccessor,
256+
) -> Tuple[StringArray, ArrayInf, Tuple]: ...
257+
258+
259+
@overload
260+
def array_interface_dict(
261+
data: "pa.DictionaryArray",
262+
) -> Tuple[StringArray, ArrayInf, Tuple]: ...
263+
264+
265+
def array_interface_dict( # pylint: disable=too-many-locals
266+
data: Union[np.ndarray, PdCatAccessor],
267+
) -> Union[ArrayInf, Tuple[StringArray, ArrayInf, Optional[Tuple]]]:
268+
"""Returns an array interface from the input."""
269+
# Handle categorical values
270+
if _is_pd_cat(data):
271+
cats = data.categories
272+
# pandas uses -1 to represent missing values for categorical features
273+
codes = data.codes.replace(-1, np.nan)
274+
275+
if np.issubdtype(cats.dtype, np.floating) or np.issubdtype(
276+
cats.dtype, np.integer
277+
):
278+
# Numeric index type
279+
name_values = cats.values
280+
jarr_values = array_interface_dict(name_values)
281+
code_values = codes.values
282+
jarr_codes = array_interface_dict(code_values)
283+
return jarr_values, jarr_codes, (name_values, code_values)
284+
285+
# String index type
286+
name_offsets, name_values = npstr_to_arrow_strarr(cats.values)
287+
name_offsets, _ = _ensure_np_dtype(name_offsets, np.int32)
288+
joffsets = array_interface_dict(name_offsets)
289+
bvalues = name_values.encode("utf-8")
290+
ptr = ctypes.c_void_p.from_buffer(ctypes.c_char_p(bvalues)).value
291+
assert ptr is not None
292+
293+
jvalues: ArrayInf = {
294+
"data": (ptr, True),
295+
"typestr": "|i1",
296+
"shape": (len(name_values),),
297+
"strides": None,
298+
"version": 3,
299+
"mask": None,
300+
}
301+
jnames: StringArray = {"offsets": joffsets, "values": jvalues}
302+
303+
code_values = codes.values
304+
jcodes = array_interface_dict(code_values)
305+
306+
buf = (
307+
name_offsets,
308+
name_values,
309+
bvalues,
310+
code_values,
311+
) # store temporary values
312+
return jnames, jcodes, buf
313+
314+
# Handle numeric values
315+
assert isinstance(data, np.ndarray)
146316
if array_hasobject(data):
147317
raise ValueError("Input data contains `object` dtype. Expecting numeric data.")
148-
arrinf = data.__array_interface__
149-
if "mask" in arrinf:
150-
arrinf["mask"] = arrinf["mask"].__array_interface__
151-
return cast(ArrayInf, arrinf)
318+
ainf = data.__array_interface__
319+
if "mask" in ainf:
320+
ainf["mask"] = ainf["mask"].__array_interface__
321+
return cast(ArrayInf, ainf)
152322

153323

154324
def array_interface(data: np.ndarray) -> bytes:

0 commit comments

Comments
 (0)
Please sign in to comment.