HDF5 and Stata I/O are broken on some architectures
author Debian Science Team <debian-science-maintainers@lists.alioth.debian.org>
Sun, 28 Jun 2020 20:47:22 +0000 (21:47 +0100)
committer Rebecca N. Palmer <rebecca_palmer@zoho.com>
Sun, 28 Jun 2020 20:47:22 +0000 (21:47 +0100)
Fix some issues, warn on use and xfail tests for the remainder

HDF5 is known to fail on s390x, and Stata on s390x and ppc64el (in 1.0.0),
but not all architectures have been checked yet

In 0.25.3 HDF5 _crashes_ on armhf, so the crashing tests are skipped rather than xfailed
(pytest-forked allows continuing past a crash,
but still seems to fail on xfailed crashes)

Author: Andreas Tille <tille@debian.org>, Graham Inggs <ginggs@debian.org>, Yaroslav Halchenko <debian@onerussian.com>, Rebecca N. Palmer <rebecca_palmer@zoho.com>
Bug-Debian: https://bugs.debian.org/877419

Gbp-Pq: Name xfail_tests_nonintel_io.patch

pandas/io/clipboards.py
pandas/io/pytables.py
pandas/io/stata.py
pandas/tests/io/pytables/test_pytables.py
pandas/tests/io/test_clipboard.py
pandas/tests/io/test_common.py
pandas/tests/io/test_stata.py
pandas/util/testing.py

index 0006824f09fe7a1c8d2b38a8549243f98f950bd0..7b03686e4bf99d93e6ceac7d4c837bfc9eebc1b8 100644 (file)
@@ -1,6 +1,8 @@
 """ io on the clipboard """
 from io import StringIO
 import warnings
+from pandas.compat import is_platform_little_endian
+warn_clipboard_platform="Non-x86 system detected, clipboard I/O may give wrong results - https://bugs.debian.org/877419" if not is_platform_little_endian() else False
 
 from pandas.core.dtypes.generic import ABCDataFrame
 
@@ -22,6 +24,8 @@ def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
     -------
     parsed : DataFrame
     """
+    if warn_clipboard_platform:
+        warnings.warn(warn_clipboard_platform)
     encoding = kwargs.pop("encoding", "utf-8")
 
     # only utf-8 is valid for passed value because that's what clipboard
@@ -96,6 +100,8 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs):  # pragma: no cover
       - Windows:
       - OS X:
     """
+    if warn_clipboard_platform:
+        warnings.warn(warn_clipboard_platform)
     encoding = kwargs.pop("encoding", "utf-8")
 
     # testing if an invalid encoding is passed to clipboard
index 3433d2560925528b2d9c05b6b7427ffcc9754f3a..24b729b705cbc6c4eb59a89fdef860b82f356875 100644 (file)
@@ -11,6 +11,10 @@ import re
 import time
 from typing import List, Optional, Type, Union
 import warnings
+import platform
+import re
+from pandas.compat import is_platform_little_endian
+warn_hdf_platform = "Non-x86 system detected, HDF(5) format I/O may give wrong results - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
 
 import numpy as np
 
@@ -477,6 +481,8 @@ class HDFStore:
     def __init__(
         self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs
     ):
+        if warn_hdf_platform:
+            warnings.warn(warn_hdf_platform)
 
         if "format" in kwargs:
             raise ValueError("format is not a defined argument for HDFStore")
@@ -698,7 +704,10 @@ class HDFStore:
             self._handle.flush()
             if fsync:
                 try:
-                    os.fsync(self._handle.fileno())
+                    if is_platform_little_endian():
+                        os.fsync(self._handle.fileno())
+                    else:
+                        os.sync() # due to a pytables bad-cast bug, fileno is invalid on 64-bit big-endian
                 except OSError:
                     pass
 
index 8dbcee829ee1e35f4a69568df2f5cf4b557a3a15..b317f3d8dfc8071a97bc1c0b25c0df9b6402e0f6 100644 (file)
@@ -17,6 +17,9 @@ import os
 import struct
 import sys
 import warnings
+import platform
+import re
+warn_stata_platform = "Non-x86 system detected, Stata format I/O may give wrong results - https://bugs.debian.org/877419" if not bool(re.match('i.?86|x86',platform.uname()[4])) else False
 
 from dateutil.relativedelta import relativedelta
 import numpy as np
@@ -911,6 +914,8 @@ class StataParser:
         # NOTE: the byte type seems to be reserved for categorical variables
         # with a label, but the underlying variable is -127 to 100
         # we're going to drop the label and cast to int
+        if warn_stata_platform:
+            warnings.warn(warn_stata_platform)
         self.DTYPE_MAP = dict(
             list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
             + [
index d67f2c3b7bd66eb24a59e4bb83884c0e0f4e5cfa..4c9a0e15efd463faf8f1651a28ec015fabdf7a63 100644 (file)
@@ -47,6 +47,11 @@ from pandas.io.pytables import (
     read_hdf,
 )
 from pandas.io.pytables import TableIterator  # noqa:E402
+import platform
+import re
+is_intel=bool(re.match('i.?86|x86',platform.uname()[4]))
+from pandas.compat import is_platform_little_endian
+pytestmark = [pytest.mark.xfail(condition=not is_intel,reason="known failure of hdf on some non-x86",strict=False),pytest.mark.forked]
 
 tables = pytest.importorskip("tables")
 
@@ -1097,6 +1102,7 @@ class TestHDFStore(Base):
             check("table", index)
             check("fixed", index)
 
+    @pytest.mark.skipif(condition=not is_intel,reason="crashes on armhf, https://bugs.debian.org/877419")
     @pytest.mark.skipif(
         not is_platform_little_endian(), reason="reason platform is not little endian"
     )
@@ -1129,6 +1135,7 @@ class TestHDFStore(Base):
         ],
     )
     @pytest.mark.parametrize("dtype", ["category", object])
+    @pytest.mark.skipif(condition=not is_intel,reason="similar to tests crashing on armhf, https://bugs.debian.org/877419")
     def test_latin_encoding(self, dtype, val):
         enc = "latin-1"
         nan_rep = ""
@@ -1308,6 +1315,7 @@ class TestHDFStore(Base):
             # read with KeyError before another write
             df.to_hdf(path, "k2")
 
+    @pytest.mark.skipif(condition=not is_intel,reason="crashes on armhf, https://bugs.debian.org/877419")
     def test_append_frame_column_oriented(self):
 
         with ensure_clean_store(self.path) as store:
@@ -3935,6 +3943,7 @@ class TestHDFStore(Base):
             with pytest.raises(NotImplementedError):
                 store.select("dfs", start=0, stop=5)
 
+    @pytest.mark.skipif(condition=not is_intel,reason="crashes on armhf, https://bugs.debian.org/877419")
     def test_select_filter_corner(self):
 
         df = DataFrame(np.random.randn(50, 100))
index fccd52f9916b84d018c546fa3c3c6724c87093a8..d2e294df23b13c66f9399f1c5653cf4f6a6f26f7 100644 (file)
@@ -8,6 +8,7 @@ import pandas as pd
 from pandas import DataFrame, get_option, read_clipboard
 from pandas.util import testing as tm
 from pandas.util.testing import makeCustomDataframe as mkdf
+from pandas.compat import is_platform_little_endian
 
 from pandas.io.clipboard import clipboard_get, clipboard_set
 from pandas.io.clipboard.exceptions import PyperclipException
@@ -258,6 +259,7 @@ class TestClipboard:
 
 @pytest.mark.single
 @pytest.mark.clipboard
+@pytest.mark.xfail(condition=not is_platform_little_endian(),reason="https://bugs.debian.org/877419",strict=False)
 @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed")
 @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."])
 def test_raw_roundtrip(data):
index 8e09e96fbd4713ba397ca7e4389d007f1ea0dd4d..6a82fed80085b59b3c0ddf5084a64d22b45e03b1 100644 (file)
@@ -8,6 +8,7 @@ import os
 import pytest
 
 from pandas.compat import is_platform_windows
+from pandas.compat import is_platform_little_endian
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -213,10 +214,10 @@ bar2,12,13,14,15
             (pd.read_fwf, "os", ("io", "data", "fixed_width_format.txt")),
             (pd.read_excel, "xlrd", ("io", "data", "test1.xlsx")),
             (pd.read_feather, "feather", ("io", "data", "feather-0_3_1.feather")),
-            (
+            pytest.param(
                 pd.read_hdf,
                 "tables",
-                ("io", "data", "legacy_hdf", "datetimetz_object.h5"),
+                ("io", "data", "legacy_hdf", "datetimetz_object.h5"),marks=pytest.mark.xfail(condition=not is_platform_little_endian(),reason="https://bugs.debian.org/877419",strict=False)
             ),
             (pd.read_stata, "os", ("io", "data", "stata10_115.dta")),
             (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
index 1e7d568602656ec4f475b46e99bfec3d723f8aad..ecd05bdfc57179bb46ae80ba28c5a46c712929bc 100644 (file)
@@ -24,7 +24,12 @@ from pandas.io.stata import (
     StataReader,
     read_stata,
 )
+import platform
+import re
+is_intel=bool(re.match('i.?86|x86',platform.uname()[4]))
 
+from pandas.compat import is_platform_little_endian
+pytestmark = pytest.mark.xfail(condition=not is_platform_little_endian(),reason="known failure of test_stata on non-little endian",strict=False)
 
 @pytest.fixture
 def dirpath(datapath):
@@ -196,7 +201,7 @@ class TestStata:
             # parsed_113 = self.read_dta(self.dta2_113)
 
             # Remove resource warnings
-            w = [x for x in w if x.category is UserWarning]
+            w = [x for x in w if x.category is UserWarning and not "Non-x86 system detected" in str(x.message)]
 
             # should get warning for each call to read_dta
             assert len(w) == 3
@@ -453,7 +458,7 @@ class TestStata:
                 warnings.simplefilter("always", InvalidColumnName)
                 original.to_stata(path, None, version=version)
                 # should get a warning for that format.
-                assert len(w) == 1
+                assert len([x for x in w if not "Non-x86 system detected" in str(x.message)]) == 1
 
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted)
@@ -496,6 +501,7 @@ class TestStata:
             written_and_read_again = self.read_dta(path)
             tm.assert_frame_equal(written_and_read_again.set_index("index"), parsed_114)
 
+    @pytest.mark.xfail(condition=not is_intel,reason="https://bugs.debian.org/877419",strict=False)
     @pytest.mark.parametrize(
         "file", ["dta15_113", "dta15_114", "dta15_115", "dta15_117"]
     )
@@ -1264,6 +1270,7 @@ class TestStata:
                 read_labels = sr.variable_labels()
             assert read_labels == variable_labels
 
+    @pytest.mark.xfail(condition=not is_intel,reason="https://bugs.debian.org/877419",strict=False)
     @pytest.mark.parametrize("version", [114, 117])
     def test_invalid_variable_labels(self, version):
         original = pd.DataFrame(
@@ -1330,6 +1337,7 @@ class TestStata:
             with tm.ensure_clean() as path:
                 original.to_stata(path, variable_labels=variable_labels_long)
 
+    @pytest.mark.xfail(condition=not is_intel,reason="https://bugs.debian.org/877419",strict=False)
     def test_default_date_conversion(self):
         # GH 12259
         dates = [
@@ -1775,8 +1783,9 @@ has been incorrectly encoded by Stata or some other software. You should verify
 the string values returned are correct."""
         with tm.assert_produces_warning(UnicodeWarning) as w:
             encoded = read_stata(self.dta_encoding_118)
-            assert len(w) == 151
-            assert w[0].message.args[0] == msg
+            w2 = [x for x in w if not "Non-x86 system detected" in str(x.message)]
+            assert len(w2) == 151
+            assert w2[0].message.args[0] == msg
 
         expected = pd.DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
         tm.assert_frame_equal(encoded, expected)
index a8f0d0da52e1f4fb47f00b8891b98610d9bedc52..330312ac39c3cad47c6d8908b4543f1a31b3eebd 100644 (file)
@@ -7,6 +7,7 @@ import gzip
 import http.client
 import os
 import re
+import platform
 from shutil import rmtree
 import string
 import tempfile
@@ -2692,6 +2693,8 @@ def assert_produces_warning(
                     )
                     assert actual_warning.filename == caller.filename, msg
             else:
+                if actual_warning.category==UserWarning and "Non-x86 system detected" in str(actual_warning.message) and not bool(re.match('i.?86|x86',platform.uname()[4])):
+                    continue
                 extra_warnings.append(
                     (
                         actual_warning.category.__name__,