HDF5 and Stata I/O are broken on some architectures
author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
Wed, 26 Aug 2020 21:34:50 +0000 (22:34 +0100)
committer Rebecca N. Palmer <rebecca_palmer@zoho.com>
Wed, 26 Aug 2020 21:34:50 +0000 (22:34 +0100)
Fix some issues, warn on use and xfail tests for the remainder

HDF5 and Stata are known to fail on big-endian architectures.
Stata also fails on qemu-ppc64el, but not on real ppc64el hardware.

In 0.25.3, HDF5 _crashes_ on armhf, so skip the affected tests instead of xfailing them
(pytest-forked allows continuing past a crash,
but the run still seems to fail when an xfailed test crashes).

Author: Andreas Tille <tille@debian.org>, Graham Inggs <ginggs@debian.org>, Yaroslav Halchenko <debian@onerussian.com>, Rebecca N. Palmer <rebecca_palmer@zoho.com>
Bug-Debian: https://bugs.debian.org/877419
Forwarded: no

Gbp-Pq: Name xfail_tests_nonintel_io.patch

pandas/_testing.py
pandas/io/pytables.py
pandas/io/stata.py
pandas/tests/io/pytables/test_store.py
pandas/tests/io/pytables/test_timezones.py
pandas/tests/io/test_common.py
pandas/tests/io/test_stata.py

index ca378e5ce8f77cc3a30d9bde5250315cef8154c8..6ff48ccb515857cd27658ed913d303479974c89d 100644 (file)
@@ -5,6 +5,8 @@ from datetime import datetime
 from functools import wraps
 import gzip
 import os
+import platform
+import re
 from shutil import rmtree
 import string
 import tempfile
@@ -2481,6 +2483,8 @@ def assert_produces_warning(
                     )
                     assert actual_warning.filename == caller.filename, msg
             else:
+                if actual_warning.category==UserWarning and "Non-x86 system detected" in str(actual_warning.message) and not bool(re.match('i.?86|x86',platform.uname()[4])):
+                    continue
                 extra_warnings.append(
                     (
                         actual_warning.category.__name__,
index 4f12c0225bd2d6e5eac72135e611ee4067e91cea..d4b4ea67a0bcf706f29284a073e79ae3ce31a8cc 100644 (file)
@@ -20,6 +20,10 @@ from typing import (
     Union,
 )
 import warnings
+import platform
+import re
+from pandas.compat import is_platform_little_endian
+warn_hdf_platform = "Non-x86 system detected, HDF(5) format I/O may give wrong results (particularly on files created with older versions) or crash - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
 
 import numpy as np
 
@@ -511,6 +515,8 @@ class HDFStore:
         fletcher32: bool = False,
         **kwargs,
     ):
+        if warn_hdf_platform:
+            warnings.warn(warn_hdf_platform)
 
         if "format" in kwargs:
             raise ValueError("format is not a defined argument for HDFStore")
@@ -724,7 +730,10 @@ class HDFStore:
             self._handle.flush()
             if fsync:
                 try:
-                    os.fsync(self._handle.fileno())
+                    if is_platform_little_endian():
+                        os.fsync(self._handle.fileno())
+                    else:
+                        os.sync() # due to a pytables bad-cast bug, fileno is invalid on 64-bit big-endian
                 except OSError:
                     pass
 
index a7246655f490a6885c17efddb12b1d426aed5028..77368c0bb24a6dbcfab6486c908affda4f3b5c63 100644 (file)
@@ -17,6 +17,9 @@ import struct
 import sys
 from typing import Any, Dict, Hashable, Optional, Sequence
 import warnings
+import platform
+import re
+warn_stata_platform = "Non-x86 system detected, Stata format I/O may give wrong results (particularly on strings) - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
 
 from dateutil.relativedelta import relativedelta
 import numpy as np
@@ -855,6 +858,8 @@ class StataParser:
         # NOTE: the byte type seems to be reserved for categorical variables
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
+        if warn_stata_platform:
+            warnings.warn(warn_stata_platform)
         self.DTYPE_MAP = dict(
             list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
             + [
index f56d042093886fb719865ca134095b976e64b87f..0d906f4c4a7e2d368d52b4c6ad589c140c7a332f 100644 (file)
@@ -54,6 +54,11 @@ from pandas.io.pytables import (
 
 from pandas.io import pytables as pytables  # noqa: E402 isort:skip
 from pandas.io.pytables import TableIterator  # noqa: E402 isort:skip
+import platform
+import re
+import sys
+is_crashing_arch=bool((platform.uname()[4].startswith('arm') or platform.uname()[4].startswith('aarch')) and sys.maxsize<=2**32) # meant for armhf, though this form will also skip on armel - uname = kernel arch
+pytestmark = pytest.mark.forked
 
 
 _default_compressor = "blosc"
@@ -1013,6 +1018,7 @@ class TestHDFStore:
             check("table", index)
             check("fixed", index)
 
+    @pytest.mark.skipif(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False)
     @pytest.mark.skipif(
         not is_platform_little_endian(), reason="reason platform is not little endian"
     )
@@ -1045,6 +1051,7 @@ class TestHDFStore:
         ],
     )
     @pytest.mark.parametrize("dtype", ["category", object])
+    @pytest.mark.skipif(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False)
     def test_latin_encoding(self, setup_path, dtype, val):
         enc = "latin-1"
         nan_rep = ""
@@ -1241,6 +1248,7 @@ class TestHDFStore:
             # still read from it.
             pd.read_hdf(store, "k1")
 
+    @pytest.mark.skipif(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False)
     def test_append_frame_column_oriented(self, setup_path):
         with ensure_clean_store(setup_path) as store:
 
@@ -3804,6 +3812,7 @@ class TestHDFStore:
             df.iloc[3:5, 1:3] = np.nan
             df.iloc[8:10, -2] = np.nan
 
+    @pytest.mark.skipif(condition=is_crashing_arch,reason="https://bugs.debian.org/790925",strict=False)
     def test_select_filter_corner(self, setup_path):
 
         df = DataFrame(np.random.randn(50, 100))
@@ -4060,6 +4069,7 @@ class TestHDFStore:
             assert isinstance(d1, DataFrame)
 
     @td.xfail_non_writeable
+    @pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
     def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
         # GH 24510
         # legacy table with fixed format written in Python 2
@@ -4740,6 +4750,7 @@ class TestHDFStore:
             with pd.HDFStore(path) as store:
                 assert os.fspath(store) == str(path)
 
+    @pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
     def test_read_py2_hdf_file_in_py3(self, datapath):
         # GH 16781
 
index 2bf22d982e5fee45027547c5bba523d970016925..597510bb4ad348b651cd09fd26cef10f4fb77ee6 100644 (file)
@@ -4,6 +4,7 @@ import numpy as np
 import pytest
 
 import pandas.util._test_decorators as td
+from pandas.compat import is_platform_little_endian
 
 import pandas as pd
 from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range
@@ -322,6 +323,7 @@ def test_store_timezone(setup_path):
         tm.assert_frame_equal(result, df)
 
 
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
 def test_legacy_datetimetz_object(datapath, setup_path):
     # legacy from < 0.17.0
     # 8260
@@ -372,6 +374,7 @@ def test_read_with_where_tz_aware_index(setup_path):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)
 def test_py2_created_with_datetimez(datapath, setup_path):
     # The test HDF5 file was created in Python 2, but could not be read in
     # Python 3.
index aa9294b016a3fffc31afb3bfa647e5c010f253e8..5bd212c9c68822335bf76bdd29ebc3401022b20b 100644 (file)
@@ -8,7 +8,7 @@ from pathlib import Path
 
 import pytest
 
-from pandas.compat import is_platform_windows
+from pandas.compat import is_platform_windows, is_platform_little_endian
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -215,11 +215,11 @@ bar2,12,13,14,15
                 "feather",
                 ("io", "data", "feather", "feather-0_3_1.feather"),
             ),
-            (
+            pytest.param(
                 pd.read_hdf,
                 "tables",
                 ("io", "data", "legacy_hdf", "datetimetz_object.h5"),
-            ),
+            marks=pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of hdf on non-little endian",strict=False,raises=AttributeError)),
             (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")),
             (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
             (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")),
index 8e459f0cf829829a65d8ae7b6bb33d56f8b4fbfd..a9b0aed6f72fb91508e96b4ce56d931ab1d30a2c 100644 (file)
@@ -25,6 +25,8 @@ from pandas.io.stata import (
     read_stata,
 )
 
+from pandas.compat import is_platform_little_endian
+pytestmark = pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of test_stata on non-little endian",strict=False)
 
 @pytest.fixture()
 def mixed_frame():
@@ -199,7 +201,7 @@ class TestStata:
             # parsed_113 = self.read_dta(self.dta2_113)
 
             # Remove resource warnings
-            w = [x for x in w if x.category is UserWarning]
+            w = [x for x in w if x.category is UserWarning and not "Non-x86 system detected" in str(x.message)]
 
             # should get warning for each call to read_dta
             assert len(w) == 3
@@ -452,7 +454,7 @@ class TestStata:
                 warnings.simplefilter("always", InvalidColumnName)
                 original.to_stata(path, None, version=version)
                 # should get a warning for that format.
-                assert len(w) == 1
+                assert len([x for x in w if not "Non-x86 system detected" in str(x.message)]) == 1
 
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
@@ -1747,8 +1749,9 @@ has been incorrectly encoded by Stata or some other software. You should verify
 the string values returned are correct."""
         with tm.assert_produces_warning(UnicodeWarning) as w:
             encoded = read_stata(self.dta_encoding_118)
-            assert len(w) == 151
-            assert w[0].message.args[0] == msg
+            w2 = [x for x in w if not "Non-x86 system detected" in str(x.message)]
+            assert len(w2) == 151
+            assert w2[0].message.args[0] == msg
 
         expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
         tm.assert_frame_equal(encoded, expected)