From: Debian Science Team Date: Mon, 7 Dec 2020 23:06:28 +0000 (+0000) Subject: Default to openpyxl not xlrd for read_excel X-Git-Tag: archive/raspbian/1.5.3+dfsg-2+rpi1~1^2^2^2^2^2^2^2^2^2^2^2~2 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=ea08ae2c5b2a48a35cf51a11130c13207c03ae5d;p=pandas.git Default to openpyxl not xlrd for read_excel xlrd 1.2 fails if defusedxml (needed for odf) is installed Bug: https://github.com/pandas-dev/pandas/pull/35029 Bug-Debian: https://bugs.debian.org/976620 Origin: upstream b3a3932af6aafaa2fd41f17e9b7995643e5f92eb Author: Robert de Vries, Rebecca N. Palmer Forwarded: not-needed Gbp-Pq: Name xlrd_976620.patch --- diff --git a/doc/source/whatsnew/v1.1.5.rst b/doc/source/whatsnew/v1.1.5.rst index 002e1f85..736fb118 100644 --- a/doc/source/whatsnew/v1.1.5.rst +++ b/doc/source/whatsnew/v1.1.5.rst @@ -8,6 +8,16 @@ including other versions of pandas. {{ header }} +.. warning:: + + Previously, the default argument ``engine=None`` to ``pd.read_excel`` + would result in using the `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ engine in + many cases. The engine ``xlrd`` is no longer maintained, and may not work if ``defusedxml`` + is installed. Hence, from version 1.1.5 in Debian and 1.2.0 upstream, + if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + many of these cases will now default to using the ``openpyxl`` engine. See the + :func:`read_excel` documentation for more details. + .. --------------------------------------------------------------------------- .. 
_whatsnew_115.regressions: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b1bbda4a..8c7a8040 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,13 +1,16 @@ import abc import datetime +import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill from typing import Union +import warnings from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -104,12 +107,32 @@ dtype : Type name or dict of column -> type, default None of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". Engine compatibility : + - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. + + .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf <https://pypi.org/project/odfpy/>`_ will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will + be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future, but may require uninstalling (python3-)defusedxml. 
+ converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -823,13 +846,32 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, - default ``xlrd``. + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` Engine compatibility : + - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. + + .. versionchanged:: 1.1.5 in Debian, 1.2.0 upstream + + The engine `xlrd <https://xlrd.readthedocs.io/en/latest/>`_ + is no longer maintained, and is not supported with + python >= 3.9. When ``engine=None``, the following logic will be + used to determine the engine. + + - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), + then `odf <https://pypi.org/project/odfpy/>`_ will be used. + - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the + extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` + will be used. + - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed, + then ``openpyxl`` will be used. + - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. + + Specifying ``engine="xlrd"`` will continue to be allowed for the + indefinite future, but may require uninstalling (python3-)defusedxml. 
""" from pandas.io.excel._odfreader import _ODFReader @@ -846,14 +888,59 @@ class ExcelFile: def __init__(self, path_or_buffer, engine=None): if engine is None: - engine = "xlrd" + # Determine ext and use odf for ods stream/file if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + ext = None if _is_ods_stream(path_or_buffer): engine = "odf" else: ext = os.path.splitext(str(path_or_buffer))[-1] if ext == ".ods": engine = "odf" + + if ( + import_optional_dependency( + "xlrd", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + from xlrd import Book + + if isinstance(path_or_buffer, Book): + engine = "xlrd" + + # GH 35029 - Prefer openpyxl except for xls files + if engine is None: + if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": + engine = "xlrd" + elif ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ): + engine = "openpyxl" + else: + caller = inspect.stack()[1] + if ( + caller.filename.endswith("pandas/io/excel/_base.py") + and caller.function == "read_excel" + ): + stacklevel = 4 + else: + stacklevel = 2 + warnings.warn( + "The xlrd engine is no longer maintained and is not " + "supported when using pandas with python >= 3.9. However, " + "the engine xlrd will continue to be allowed for the " + "indefinite future. The " + "openpyxl engine will be used if it is installed and the " + "engine argument is not specified. 
Either install openpyxl " + "or specify engine='xlrd' to silence this warning.", + FutureWarning, + stacklevel=stacklevel, + ) + engine = "xlrd" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index e9ea1363..626b26b9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -599,6 +599,10 @@ class TestReaders: if pd.read_excel.keywords["engine"] == "openpyxl": pytest.xfail("Maybe not supported by openpyxl") + if pd.read_excel.keywords["engine"] is None: + # GH 35029 + pytest.xfail("Defaults to openpyxl, maybe not supported") + result = pd.read_excel("testdateoverflow" + read_ext) tm.assert_frame_equal(result, expected) @@ -1153,12 +1157,13 @@ class TestExcelFileRead: actual = pd.read_excel(data, engine=engine) tm.assert_frame_equal(expected, actual) + @td.skip_if_no("xlrd") def test_excel_high_surrogate(self, engine): # GH 23809 expected = pd.DataFrame(["\udc88"], columns=["Column1"]) # should not produce a segmentation violation - actual = pd.read_excel("high_surrogate.xlsx") + actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd") tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e3ee53b6..8197c851 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -351,12 +351,15 @@ class TestExcelWriter: msg = "sheet 0 not found" with pytest.raises(ValueError, match=msg): pd.read_excel(xl, "0") - else: + elif engine == "xlwt": import xlrd msg = "No sheet named <'0'>" with pytest.raises(xlrd.XLRDError, match=msg): pd.read_excel(xl, sheet_name="0") + else: + with pytest.raises(KeyError, match="Worksheet 0 does not exist."): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, 
path): with ExcelWriter(path) as writer: @@ -1195,7 +1198,9 @@ class TestExcelWriter: write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") - read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0) + # GH 35029 - Default changed to openpyxl, but test is for odf/xlrd + engine = "odf" if path.endswith("ods") else "xlrd" + read_frame = pd.read_excel(path, sheet_name="Sheet1", header=0, engine=engine) tm.assert_series_equal(write_frame["A"], read_frame["A"]) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 1c9c514b..aac4f4f1 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -1,5 +1,7 @@ import pytest +from pandas.compat._optional import import_optional_dependency + import pandas as pd import pandas._testing as tm @@ -38,6 +40,48 @@ def test_read_xlrd_book(read_ext, frame): # TODO: test for openpyxl as well def test_excel_table_sheet_by_index(datapath, read_ext): path = datapath("io", "data", "excel", f"test1{read_ext}") - with pd.ExcelFile(path) as excel: + with pd.ExcelFile(path, engine="xlrd") as excel: with pytest.raises(xlrd.XLRDError): pd.read_excel(excel, sheet_name="asdf") + + +def test_excel_file_warning_with_xlsx_file(datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + ) + is not None + ) + if not has_openpyxl: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + ExcelFile(path, engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None) + + +def test_read_excel_warning_with_xlsx_file(tmpdir, datapath): + # GH 29375 + path = datapath("io", "data", "excel", "test1.xlsx") + has_openpyxl = ( + import_optional_dependency( + "openpyxl", raise_on_missing=False, on_version="ignore" + 
) + is not None + ) + if not has_openpyxl: + with tm.assert_produces_warning( + FutureWarning, + raise_on_extra_warnings=False, + match="The xlrd engine is no longer maintained", + ): + pd.read_excel(path, "Sheet1", engine=None) + else: + with tm.assert_produces_warning(None): + pd.read_excel(path, "Sheet1", engine=None)