[BUG] Test Failures caused because of pandas 3 (#1628)

satvshr · web-flow · commit d421b9ec58bc · 2026-01-26T12:30:51.000+01:00
#### Metadata * Reference Issue: fixes #1627 * What does this PR implement/fix? Explain your changes. This PR fixes the 7 recurring bugs across all current PRs because of pandas 3: 1. `test_get_data_pandas` bug: Solved type error for dataframe columns having `str` datatype for `pandas==3` and `object` for older versions. 2. `test_get_sparse_dataset_dataframe`, `test_get_sparse_dataset_rowid_and_ignore_and_target`, and `test_get_sparse_dataset_dataframe_with_target` bug: typecasting `type_` to a np array. 3. bugs in `test_flow_functions.py`: `ext_version` can now be `nan` because of `pandas 3`.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   test:
-    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }})
+    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
     runs-on: ${{ matrix.os }}
 
     strategy:
@@ -64,6 +64,14 @@ jobs:
             sklearn-only: "false"
             code-cov: true
 
+          # Pandas 2 run
+          - os: ubuntu-latest
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
+            pandas-version: "2.*"
+            code-cov: false
+
     steps:
     - uses: actions/checkout@v6
       with:
@@ -74,10 +82,16 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install test dependencies and scikit-learn
+    - name: Install test dependencies, scikit-learn, and optional pandas
+      shell: bash
       run: |
         python -m pip install --upgrade pip
         pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
+        
+        if [ "${{ matrix.pandas-version }}" != "" ]; then
+          echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
+          pip install "pandas==${{ matrix.pandas-version }}"
+        fi
 
     - name: Store repository status
       id: status-before
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -488,7 +488,7 @@ def _parse_data_from_arff(  # noqa: C901, PLR0912, PLR0915
                 try:
                     # checks if the strings which should be the class labels
                     # can be encoded into integers
-                    pd.factorize(type_)[0]
+                    pd.factorize(np.array(type_))[0]
                 except ValueError as e:
                     raise ValueError(
                         "Categorical data needs to be numeric when using sparse ARFF."
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
         assert isinstance(data, pd.DataFrame)
         assert data.shape[1] == len(self.titanic.features)
         assert data.shape[0] == 1309
+        # Dynamically detect what this version of Pandas calls string columns.
+        str_dtype = data["name"].dtype.name
+
         col_dtype = {
             "pclass": "uint8",
             "survived": "category",
-            "name": "object",
+            "name": str_dtype,
             "sex": "category",
             "age": "float64",
             "sibsp": "uint8",
             "parch": "uint8",
-            "ticket": "object",
+            "ticket": str_dtype,
             "fare": "float64",
-            "cabin": "object",
+            "cabin": str_dtype,
             "embarked": "category",
-            "boat": "object",
+            "boat": str_dtype,
             "body": "float64",
-            "home.dest": "object",
+            "home.dest": str_dtype,
         }
         for col_name in data.columns:
             assert data[col_name].dtype.name == col_dtype[col_name]
@@ -357,7 +360,7 @@ def setUp(self):
     def test_get_sparse_dataset_dataframe_with_target(self):
         X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
         assert isinstance(X, pd.DataFrame)
-        assert isinstance(X.dtypes[0], pd.SparseDtype)
+        assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
         assert X.shape == (600, 20000)
 
         assert isinstance(y, pd.Series)
diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
@@ -41,8 +41,9 @@ def _check_flow(self, flow):
         assert isinstance(flow["full_name"], str)
         assert isinstance(flow["version"], str)
         # There are some runs on openml.org that can have an empty external version
+        ext_version = flow["external_version"]
         ext_version_str_or_none = (
-            isinstance(flow["external_version"], str) or flow["external_version"] is None
+            isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
         )
         assert ext_version_str_or_none