Skip to content

Commit d421b9e

Browse files
authored
[BUG] Fix test failures caused by pandas 3 (#1628)
#### Metadata

* Reference Issue: fixes #1627
* What does this PR implement/fix? Explain your changes.

This PR fixes the 7 recurring test failures that pandas 3 causes across all current PRs:

1. `test_get_data_pandas`: fixed the dtype mismatch for string DataFrame columns, whose dtype is `str` under `pandas==3` and `object` under older versions. The test now detects the expected string dtype from the data instead of hard-coding `object`.
2. `test_get_sparse_dataset_dataframe`, `test_get_sparse_dataset_rowid_and_ignore_and_target`, and `test_get_sparse_dataset_dataframe_with_target`: `type_` is now converted to a NumPy array before it is passed to `pd.factorize`.
3. Failures in `test_flow_functions.py`: `ext_version` can now be `nan` under pandas 3, so the check also accepts missing values via `pd.isna`.
1 parent cf8e9db commit d421b9e

File tree

4 files changed

+28
-10
lines changed

4 files changed

+28
-10
lines changed

.github/workflows/test.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ concurrency:
2222

2323
jobs:
2424
test:
25-
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }})
25+
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
2626
runs-on: ${{ matrix.os }}
2727

2828
strategy:
@@ -64,6 +64,14 @@ jobs:
6464
sklearn-only: "false"
6565
code-cov: true
6666

67+
# Pandas 2 run
68+
- os: ubuntu-latest
69+
python-version: "3.12"
70+
scikit-learn: "1.5.*"
71+
sklearn-only: "false"
72+
pandas-version: "2.*"
73+
code-cov: false
74+
6775
steps:
6876
- uses: actions/checkout@v6
6977
with:
@@ -74,10 +82,16 @@ jobs:
7482
with:
7583
python-version: ${{ matrix.python-version }}
7684

77-
- name: Install test dependencies and scikit-learn
85+
- name: Install test dependencies, scikit-learn, and optional pandas
86+
shell: bash
7887
run: |
7988
python -m pip install --upgrade pip
8089
pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
90+
91+
if [ "${{ matrix.pandas-version }}" != "" ]; then
92+
echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
93+
pip install "pandas==${{ matrix.pandas-version }}"
94+
fi
8195
8296
- name: Store repository status
8397
id: status-before

openml/datasets/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915
488488
try:
489489
# checks if the strings which should be the class labels
490490
# can be encoded into integers
491-
pd.factorize(type_)[0]
491+
pd.factorize(np.array(type_))[0]
492492
except ValueError as e:
493493
raise ValueError(
494494
"Categorical data needs to be numeric when using sparse ARFF."

tests/test_datasets/test_dataset.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
102102
assert isinstance(data, pd.DataFrame)
103103
assert data.shape[1] == len(self.titanic.features)
104104
assert data.shape[0] == 1309
105+
# Dynamically detect what this version of Pandas calls string columns.
106+
str_dtype = data["name"].dtype.name
107+
105108
col_dtype = {
106109
"pclass": "uint8",
107110
"survived": "category",
108-
"name": "object",
111+
"name": str_dtype,
109112
"sex": "category",
110113
"age": "float64",
111114
"sibsp": "uint8",
112115
"parch": "uint8",
113-
"ticket": "object",
116+
"ticket": str_dtype,
114117
"fare": "float64",
115-
"cabin": "object",
118+
"cabin": str_dtype,
116119
"embarked": "category",
117-
"boat": "object",
120+
"boat": str_dtype,
118121
"body": "float64",
119-
"home.dest": "object",
122+
"home.dest": str_dtype,
120123
}
121124
for col_name in data.columns:
122125
assert data[col_name].dtype.name == col_dtype[col_name]
@@ -357,7 +360,7 @@ def setUp(self):
357360
def test_get_sparse_dataset_dataframe_with_target(self):
358361
X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
359362
assert isinstance(X, pd.DataFrame)
360-
assert isinstance(X.dtypes[0], pd.SparseDtype)
363+
assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
361364
assert X.shape == (600, 20000)
362365

363366
assert isinstance(y, pd.Series)

tests/test_flows/test_flow_functions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ def _check_flow(self, flow):
4141
assert isinstance(flow["full_name"], str)
4242
assert isinstance(flow["version"], str)
4343
# There are some runs on openml.org that can have an empty external version
44+
ext_version = flow["external_version"]
4445
ext_version_str_or_none = (
45-
isinstance(flow["external_version"], str) or flow["external_version"] is None
46+
isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
4647
)
4748
assert ext_version_str_or_none
4849

0 commit comments

Comments
 (0)