Default to openpyxl not xlrd for read_excel

author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>

Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)

committer Rebecca N. Palmer <rebecca_palmer@zoho.com>

Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)
author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)
committer Rebecca N. Palmer <rebecca_palmer@zoho.com>
Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)
diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst

index 002e1f85f41275960abdcfc494d2d1170014b566..736fb1187ed90523382b2a02a2a2a43201a4108f 100644 (file)
--- a/doc/source/whatsnew/v1.1.5.rst
+++ b/doc/source/whatsnew/v1.1.5.rst
@@ -8,6 +8,16 @@ including other versions of pandas.
  
  {{ header }}
  
+.. warning::
+
+   Previously, the default argument ``engine=None`` to ``pd.read_excel``
+   would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in
+   many cases. The engine ``xlrd`` is no longer maintained, and may not work if ``defusedxml``
+   is installed.  Hence, from version 1.1.5 in Debian and 1.2.0 upstream,
+   if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+   many of these cases will now default to using the ``openpyxl`` engine. See the
+   :func:`read_excel` documentation for more details.
+
  .. ---------------------------------------------------------------------------
  
  .. _whatsnew_115.regressions:
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py

index b1bbda4a4b7e0259ec5bd134669358fce8ca6ce9..8c7a8040b2d25f72b30f4600ec1142f754a44ff6 100644 (file)
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,13 +1,16 @@
  import abc
  import datetime
+import inspect
  from io import BufferedIOBase, BytesIO, RawIOBase
  import os
  from textwrap import fill
  from typing import Union
+import warnings
  
  from pandas._config import config
  
  from pandas._libs.parsers import STR_NA_VALUES
+from pandas.compat._optional import import_optional_dependency
  from pandas.errors import EmptyDataError
  from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
  
@@ -104,12 +107,32 @@ dtype : Type name or dict of column -> type, default None
      of dtype conversion.
  engine : str, default None
      If io is not a buffer or path, this must be set to identify io.
-    Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd".
+    Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb".
      Engine compatibility :
+
      - "xlrd" supports most old/new Excel file formats.
      - "openpyxl" supports newer Excel file formats.
      - "odf" supports OpenDocument file formats (.odf, .ods, .odt).
      - "pyxlsb" supports Binary Excel files.
+
+    .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
+        The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
+        is no longer maintained, and is not supported with
+        python >= 3.9. When ``engine=None``, the following logic will be
+        used to determine the engine.
+
+        - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+          then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+        - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
+          extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
+          be used.
+        - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+          then ``openpyxl`` will be used.
+        - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.
+
+        Specifying ``engine="xlrd"`` will continue to be allowed for the
+        indefinite future, but may require uninstalling (python3-)defusedxml.
+
  converters : dict, default None
      Dict of functions for converting values in certain columns. Keys can
      either be integers or column labels, values are functions that take one
@@ -823,13 +846,32 @@ class ExcelFile:
          .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file.
      engine : str, default None
          If io is not a buffer or path, this must be set to identify io.
-        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``,
-        default ``xlrd``.
+        Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``.
          Engine compatibility :
+
          - ``xlrd`` supports most old/new Excel file formats.
          - ``openpyxl`` supports newer Excel file formats.
          - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt).
          - ``pyxlsb`` supports Binary Excel files.
+
+        .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream
+
+           The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_
+           is no longer maintained, and is not supported with
+           python >= 3.9. When ``engine=None``, the following logic will be
+           used to determine the engine.
+
+           - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
+             then `odf <https://pypi.org/project/odfpy/>`_ will be used.
+           - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
+             extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
+             will be used.
+           - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
+             then ``openpyxl`` will be used.
+           - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be issued.
+
+           Specifying ``engine="xlrd"`` will continue to be allowed for the
+           indefinite future, but may require uninstalling (python3-)defusedxml.
      """
  
      from pandas.io.excel._odfreader import _ODFReader
@@ -846,14 +888,59 @@ class ExcelFile:
  
      def __init__(self, path_or_buffer, engine=None):
          if engine is None:
-            engine = "xlrd"
+            # Determine ext and use odf for ods stream/file
              if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
+                ext = None
                  if _is_ods_stream(path_or_buffer):
                      engine = "odf"
              else:
                  ext = os.path.splitext(str(path_or_buffer))[-1]
                  if ext == ".ods":
                      engine = "odf"
+
+            if (
+                import_optional_dependency(
+                    "xlrd", raise_on_missing=False, on_version="ignore"
+                )
+                is not None
+            ):
+                from xlrd import Book
+
+                if isinstance(path_or_buffer, Book):
+                    engine = "xlrd"
+
+            # GH 35029 - Prefer openpyxl except for xls files
+            if engine is None:
+                if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
+                    engine = "xlrd"
+                elif (
+                    import_optional_dependency(
+                        "openpyxl", raise_on_missing=False, on_version="ignore"
+                    )
+                    is not None
+                ):
+                    engine = "openpyxl"
+                else:
+                    caller = inspect.stack()[1]
+                    if (
+                        caller.filename.endswith("pandas/io/excel/_base.py")
+                        and caller.function == "read_excel"
+                    ):
+                        stacklevel = 4
+                    else:
+                        stacklevel = 2
+                    warnings.warn(
+                        "The xlrd engine is no longer maintained and is not "
+                        "supported when using pandas with python >= 3.9. However, "
+                        "the engine xlrd will continue to be allowed for the "
+                        "indefinite future. The "
+                        "openpyxl engine will be used if it is installed and the "
+                        "engine argument is not specified. Either install openpyxl "
+                        "or specify engine='xlrd' to silence this warning.",
+                        FutureWarning,
+                        stacklevel=stacklevel,
+                    )
+                    engine = "xlrd"
          if engine not in self._engines:
              raise ValueError(f"Unknown engine: {engine}")
  
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py

index e9ea1363a9d3cfd71e3cfb53e8f33cbe52ea5e8c..626b26b97d2f21954737b76de5b257534cc600f7 100644 (file)
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -599,6 +599,10 @@ class TestReaders:
          if pd.read_excel.keywords["engine"] == "openpyxl":
              pytest.xfail("Maybe not supported by openpyxl")
  
+        if pd.read_excel.keywords["engine"] is None:
+            # GH 35029
+            pytest.xfail("Defaults to openpyxl, maybe not supported")
+
          result = pd.read_excel("testdateoverflow" + read_ext)
          tm.assert_frame_equal(result, expected)
  
@@ -1153,12 +1157,13 @@ class TestExcelFileRead:
          actual = pd.read_excel(data, engine=engine)
          tm.assert_frame_equal(expected, actual)
  
+    @td.skip_if_no("xlrd")
      def test_excel_high_surrogate(self, engine):
          # GH 23809
          expected = pd.DataFrame(["\udc88"], columns=["Column1"])
  
          # should not produce a segmentation violation
-        actual = pd.read_excel("high_surrogate.xlsx")
+        actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
          tm.assert_frame_equal(expected, actual)
  
      @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py

index e3ee53b63e102a6828bd36521894181b2788255d..8197c851514d89f9cac7fa84fa6b432ae2339500 100644 (file)
--- a/pandas/tests/io/excel/test_writers.py
+++ b/pandas/tests/io/excel/test_writers.py
@@ -351,12 +351,15 @@ class TestExcelWriter:
              msg = "sheet 0 not found"
              with pytest.raises(ValueError, match=msg):
                  pd.read_excel(xl, "0")
-        else:
+        elif engine == "xlwt":
              import xlrd
  
              msg = "No sheet named <'0'>"
              with pytest.raises(xlrd.XLRDError, match=msg):
                  pd.read_excel(xl, sheet_name="0")
+        else:
+            with pytest.raises(KeyError, match="Worksheet 0 does not exist."):
+                pd.read_excel(xl, sheet_name="0")
  
      def test_excel_writer_context_manager(self, frame, path):
          with ExcelWriter(path) as writer:
@@ -1195,7 +1198,9 @@ class TestExcelWriter:
  
          write_frame = DataFrame({"A": datetimes})
          write_frame.to_excel(path, "Sheet1")
-        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0)
+        # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd
+        engine = "odf" if path.endswith("ods") else "xlrd"
+        read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine)
  
          tm.assert_series_equal(write_frame["A"], read_frame["A"])
  
diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py

index 1c9c514b20f4624aa436cc8588b1df8b3fc9e930..aac4f4f13dc43af25ba290df8e056f08077c6cde 100644 (file)
--- a/pandas/tests/io/excel/test_xlrd.py
+++ b/pandas/tests/io/excel/test_xlrd.py
@@ -1,5 +1,7 @@
  import pytest
  
+from pandas.compat._optional import import_optional_dependency
+
  import pandas as pd
  import pandas._testing as tm
  
@@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame):
  # TODO: test for openpyxl as well
  def test_excel_table_sheet_by_index(datapath, read_ext):
      path = datapath("io", "data", "excel", f"test1{read_ext}")
-    with pd.ExcelFile(path) as excel:
+    with pd.ExcelFile(path, engine="xlrd") as excel:
          with pytest.raises(xlrd.XLRDError):
              pd.read_excel(excel, sheet_name="asdf")
+
+
+def test_excel_file_warning_with_xlsx_file(datapath):
+    # GH 29375
+    path = datapath("io", "data", "excel", "test1.xlsx")
+    has_openpyxl = (
+        import_optional_dependency(
+            "openpyxl", raise_on_missing=False, on_version="ignore"
+        )
+        is not None
+    )
+    if not has_openpyxl:
+        with tm.assert_produces_warning(
+            FutureWarning,
+            raise_on_extra_warnings=False,
+            match="The xlrd engine is no longer maintained",
+        ):
+            ExcelFile(path, engine=None)
+    else:
+        with tm.assert_produces_warning(None):
+            pd.read_excel(path, "Sheet1", engine=None)
+
+
+def test_read_excel_warning_with_xlsx_file(tmpdir, datapath):
+    # GH 29375
+    path = datapath("io", "data", "excel", "test1.xlsx")
+    has_openpyxl = (
+        import_optional_dependency(
+            "openpyxl", raise_on_missing=False, on_version="ignore"
+        )
+        is not None
+    )
+    if not has_openpyxl:
+        with tm.assert_produces_warning(
+            FutureWarning,
+            raise_on_extra_warnings=False,
+            match="The xlrd engine is no longer maintained",
+        ):
+            pd.read_excel(path, "Sheet1", engine=None)
+    else:
+        with tm.assert_produces_warning(None):
+            pd.read_excel(path, "Sheet1", engine=None)
author	Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
	Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)
committer	Rebecca N. Palmer <rebecca_palmer@zoho.com>
	Mon, 7 Dec 2020 23:06:28 +0000 (23:06 +0000)
doc/source/whatsnew/v1.1.5.rst		patch \| blob \| history
pandas/io/excel/_base.py		patch \| blob \| history
pandas/tests/io/excel/test_readers.py		patch \| blob \| history
pandas/tests/io/excel/test_writers.py		patch \| blob \| history
pandas/tests/io/excel/test_xlrd.py		patch \| blob \| history