HDF5 and Stata I/O are broken on some architectures

author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>

Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)

committer Rebecca N. Palmer <rebecca_palmer@zoho.com>

Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)
author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)
committer Rebecca N. Palmer <rebecca_palmer@zoho.com>
Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)
diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py

index 5153118e9b142237522de42a65388e5d384ecf03..226d28dc0c17860167f3915b2b8a369517d395fc 100644 (file)
--- a/pandas/_testing/_warnings.py
+++ b/pandas/_testing/_warnings.py
@@ -8,6 +8,7 @@ from typing import (
      cast,
  )
  import warnings
+import platform
  
  
  @contextmanager
@@ -149,6 +150,8 @@ def _assert_caught_no_extra_warnings(
                  # FIXME: kludge because pytest.filterwarnings does not
                  #  suppress these, xref GH#38630
                  continue
+            if actual_warning.category==UserWarning and "Non-x86 system detected" in str(actual_warning.message) and not bool(re.match('i.?86|x86',platform.uname()[4])):
+                continue
  
              extra_warnings.append(
                  (
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py

index 1b4bd62ee7db70be437f9c264ca4b86129b4bb41..682d2f85b7cbc44f308365c2a9aa06629b570904 100644 (file)
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -23,6 +23,10 @@ from typing import (
      cast,
  )
  import warnings
+import platform
+import re
+from pandas.compat import is_platform_little_endian
+warn_hdf_platform = "Non-x86 system detected, HDF(5) format I/O may give wrong results (particularly on files created with older versions) or crash - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
  
  import numpy as np
  
@@ -565,6 +569,8 @@ class HDFStore:
          fletcher32: bool = False,
          **kwargs,
      ):
+        if warn_hdf_platform:
+            warnings.warn(warn_hdf_platform)
  
          if "format" in kwargs:
              raise ValueError("format is not a defined argument for HDFStore")
@@ -776,7 +782,10 @@ class HDFStore:
              self._handle.flush()
              if fsync:
                  with suppress(OSError):
-                    os.fsync(self._handle.fileno())
+                    if is_platform_little_endian():
+                        os.fsync(self._handle.fileno())
+                    else:
+                        os.sync() # due to a pytables bad-cast bug, fileno is invalid on 64-bit big-endian
  
      def get(self, key: str):
          """
diff --git a/pandas/io/stata.py b/pandas/io/stata.py

index 20e035891a625f5c2eca29cd18141d513d8712f8..c857bd17746df2d10ff4e034463b292fa7ee3993 100644 (file)
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -25,6 +25,9 @@ from typing import (
      cast,
  )
  import warnings
+import platform
+import re
+warn_stata_platform = "Non-x86 system detected, Stata format I/O may give wrong results (particularly on strings) - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
  
  from dateutil.relativedelta import relativedelta
  import numpy as np
@@ -896,6 +899,8 @@ class StataParser:
          # NOTE: the byte type seems to be reserved for categorical variables
          # with a label, but the underlying variable is -127 to 100
          # we're going to drop the label and cast to int
+        if warn_stata_platform:
+            warnings.warn(warn_stata_platform)
          self.DTYPE_MAP = dict(
              list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
              + [
diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py

index b5f9e6e74ece92ee99c692c32d91462ad00002e7..bfffcde3c516f66fb663393361d53b883139731e 100644 (file)
--- a/pandas/tests/io/pytables/test_append.py
+++ b/pandas/tests/io/pytables/test_append.py
@@ -24,6 +24,10 @@ from pandas.tests.io.pytables.common import (
      ensure_clean_path,
      ensure_clean_store,
  )
+import platform
+import re
+import sys
+is_crashing_arch=bool((platform.uname()[4].startswith('arm') or platform.uname()[4].startswith('aarch')) and sys.maxsize<2**33) # meant for armhf, though this form will also skip on armel - uname = kernel arch
  
  pytestmark = pytest.mark.single
  
@@ -277,6 +281,7 @@ def test_append_all_nans(setup_path):
          tm.assert_frame_equal(store["df2"], df)
  
  
+@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
  def test_append_frame_column_oriented(setup_path):
      with ensure_clean_store(setup_path) as store:
  
diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py

index 88e2b5f080282b59be11f00a558ad6dfe7d0695d..ed2e8615d5df5d2569da1bd24d2dfaabea5dc9cf 100644 (file)
--- a/pandas/tests/io/pytables/test_file_handling.py
+++ b/pandas/tests/io/pytables/test_file_handling.py
@@ -25,6 +25,10 @@ from pandas.io.pytables import (
      PossibleDataLossError,
      Term,
  )
+import platform
+import re
+import sys
+is_crashing_arch=bool((platform.uname()[4].startswith('arm') or platform.uname()[4].startswith('aarch')) and sys.maxsize<2**33) # meant for armhf, though this form will also skip on armel - uname = kernel arch
  
  pytestmark = pytest.mark.single
  
@@ -270,6 +274,7 @@ def test_complibs(setup_path):
              h5table.close()
  
  
+@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
  @pytest.mark.skipif(
      not is_platform_little_endian(), reason="reason platform is not little endian"
  )
@@ -303,6 +308,7 @@ def test_encoding(setup_path):
      ],
  )
  @pytest.mark.parametrize("dtype", ["category", object])
+@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
  def test_latin_encoding(setup_path, dtype, val):
      enc = "latin-1"
      nan_rep = ""
diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py

index 1c9e63c66aadbe1cdd3e01105fccc68991b56dc2..60dcf01871a3dc0d755407d1684a10125d82e9fe 100644 (file)
--- a/pandas/tests/io/pytables/test_read.py
+++ b/pandas/tests/io/pytables/test_read.py
@@ -5,7 +5,7 @@ import numpy as np
  import pytest
  
  from pandas._libs.tslibs import Timestamp
-from pandas.compat import is_platform_windows
+from pandas.compat import is_platform_windows, is_platform_little_endian
  
  import pandas as pd
  from pandas import (
@@ -155,6 +155,7 @@ def test_pytables_native2_read(datapath, setup_path):
          assert isinstance(d1, DataFrame)
  
  
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
  def test_legacy_table_fixed_format_read_py2(datapath, setup_path):
      # GH 24510
      # legacy table with fixed format written in Python 2
@@ -170,6 +171,7 @@ def test_legacy_table_fixed_format_read_py2(datapath, setup_path):
          tm.assert_frame_equal(expected, result)
  
  
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
  def test_legacy_table_fixed_format_read_datetime_py2(datapath, setup_path):
      # GH 31750
      # legacy table with fixed format and datetime64 column written in Python 2
@@ -319,6 +321,7 @@ def test_read_hdf_series_mode_r(format, setup_path):
      tm.assert_series_equal(result, series)
  
  
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
  def test_read_py2_hdf_file_in_py3(datapath):
      # GH 16781
  
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py

index 856a2ca15ec4a93b2fda73041ed96f3fe98f0f0f..07f5a00c2efc77477919817757d1fd02a23295ff 100644 (file)
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -39,6 +39,10 @@ from pandas.io.pytables import (
      HDFStore,
      read_hdf,
  )
+import platform
+import re
+import sys
+is_crashing_arch=bool((platform.uname()[4].startswith('arm') or platform.uname()[4].startswith('aarch')) and sys.maxsize<2**33) # meant for armhf, though this form will also skip on armel - uname = kernel arch
  
  pytestmark = pytest.mark.single
  
@@ -789,6 +793,7 @@ def test_start_stop_fixed(setup_path):
          df.iloc[8:10, -2] = np.nan
  
  
+@pytest.mark.xfail(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False,run=False)
  def test_select_filter_corner(setup_path):
  
      df = DataFrame(np.random.randn(50, 100))
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py

index 36fa79d0bb7e34b87cbb857ed3440dd699e76401..b49bc884e26289264f5ffd690b73a30ffab7cfb9 100644 (file)
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -8,6 +8,7 @@ import pytest
  
  from pandas._libs.tslibs.timezones import maybe_get_tz
  import pandas.util._test_decorators as td
+from pandas.compat import is_platform_little_endian
  
  import pandas as pd
  from pandas import (
@@ -304,6 +305,7 @@ def test_store_timezone(setup_path):
          tm.assert_frame_equal(result, df)
  
  
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
  def test_legacy_datetimetz_object(datapath, setup_path):
      # legacy from < 0.17.0
      # 8260
@@ -356,6 +358,7 @@ def test_read_with_where_tz_aware_index(setup_path):
          tm.assert_frame_equal(result, expected)
  
  
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
  def test_py2_created_with_datetimez(datapath, setup_path):
      # The test HDF5 file was created in Python 2, but could not be read in
      # Python 3.
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py

index d52ea01ac35dec2bfbc2fdec99a0a7a36411a055..41b21af147087d375aaed1b945a2efba9f1a12e2 100644 (file)
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -15,7 +15,7 @@ import tempfile
  
  import pytest
  
-from pandas.compat import is_platform_windows
+from pandas.compat import is_platform_windows, is_platform_little_endian
  import pandas.util._test_decorators as td
  
  import pandas as pd
@@ -244,11 +244,11 @@ bar2,12,13,14,15
                  "pyarrow",
                  ("io", "data", "feather", "feather-0_3_1.feather"),
              ),
-            (
+            pytest.param(
                  pd.read_hdf,
                  "tables",
                  ("io", "data", "legacy_hdf", "datetimetz_object.h5"),
-            ),
+            marks=pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)),
              (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")),
              (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
              (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")),
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py

index 3ba5835331fe5b497e1113204372df93ccfaae19..979a78106dc1cd575565700cdc721a60b4c4a460 100644 (file)
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -33,6 +33,8 @@ from pandas.io.stata import (
      read_stata,
  )
  
+from pandas.compat import is_platform_little_endian
+pytestmark = pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of test_stata on non-little endian",strict=False)
  
  @pytest.fixture()
  def mixed_frame():
@@ -207,7 +209,7 @@ class TestStata:
              # parsed_113 = self.read_dta(self.dta2_113)
  
              # Remove resource warnings
-            w = [x for x in w if x.category is UserWarning]
+            w = [x for x in w if x.category is UserWarning and not "Non-x86 system detected" in str(x.message)]
  
              # should get warning for each call to read_dta
              assert len(w) == 3
@@ -469,7 +471,7 @@ class TestStata:
                  warnings.simplefilter("always", InvalidColumnName)
                  original.to_stata(path, None, version=version)
                  # should get a warning for that format.
-                assert len(w) == 1
+                assert len([x for x in w if not "Non-x86 system detected" in str(x.message)]) == 1
  
              written_and_read_again = self.read_dta(path)
              tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
@@ -1793,8 +1795,9 @@ has been incorrectly encoded by Stata or some other software. You should verify
  the string values returned are correct."""
          with tm.assert_produces_warning(UnicodeWarning) as w:
              encoded = read_stata(self.dta_encoding_118)
-            assert len(w) == 151
-            assert w[0].message.args[0] == msg
+            w2 = [x for x in w if not "Non-x86 system detected" in str(x.message)]
+            assert len(w2) == 151
+            assert w2[0].message.args[0] == msg
  
          expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
          tm.assert_frame_equal(encoded, expected)
author	Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
	Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)
committer	Rebecca N. Palmer <rebecca_palmer@zoho.com>
	Mon, 21 Feb 2022 07:35:51 +0000 (07:35 +0000)
pandas/_testing/_warnings.py		patch \| blob \| history
pandas/io/pytables.py		patch \| blob \| history
pandas/io/stata.py		patch \| blob \| history
pandas/tests/io/pytables/test_append.py		patch \| blob \| history
pandas/tests/io/pytables/test_file_handling.py		patch \| blob \| history
pandas/tests/io/pytables/test_read.py		patch \| blob \| history
pandas/tests/io/pytables/test_store.py		patch \| blob \| history
pandas/tests/io/pytables/test_timezones.py		patch \| blob \| history
pandas/tests/io/test_common.py		patch \| blob \| history
pandas/tests/io/test_stata.py		patch \| blob \| history