From: Antonio Valentino Date: Wed, 29 Dec 2021 22:51:43 +0000 (+0000) Subject: Import pytables_3.7.0.orig.tar.gz X-Git-Tag: archive/raspbian/3.10.2-1+rpi1~1^2^2~7 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=40a47bdfd1b8ab3a0b88d1093b54823dd306ee2b;p=pytables.git Import pytables_3.7.0.orig.tar.gz [dgit import orig pytables_3.7.0.orig.tar.gz] --- 40a47bdfd1b8ab3a0b88d1093b54823dd306ee2b diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..57ebf83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: [push, pull_request] + +jobs: + build: + name: ${{ matrix.os }} ${{ matrix.python }} ${{ matrix.name }} + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python: ['3.6', '3.7', '3.8', '3.9', '3.10'] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + submodules: true + - name: Set up Python ${{ matrix.python }} + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: ${{ matrix.python }} + auto-update-conda: true + channels: conda-forge + channel-priority: strict + - name: Install dependencies + run: | + conda install setuptools pip wheel build packaging numpy numexpr cython bzip2 hdf5 lzo + # conda install sphinx sphinx_rtd_theme numpydoc ipython + - name: Source distribution + run: | + python -m build --sdist + - name: Installation + run: | + pip install -v dist/*.tar.gz + - name: 'Run test' + run: | + cd .. && python -m tables.tests.test_all + pt2to3 -h + ptrepack -h + ptdump -h + pttree -h + diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml new file mode 100644 index 0000000..bde936f --- /dev/null +++ b/.github/workflows/ubuntu.yml @@ -0,0 +1,41 @@ +name: ubuntu + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + submodules: true + - name: Set up Python ${{ matrix.python }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python }} + - name: Install APT packages + run: | + sudo apt-get update + sudo apt install libblosc-dev libbz2-dev libhdf5-dev liblz4-dev liblzo2-dev libsnappy-dev libzstd-dev zlib1g-dev + sudo apt install python3-all-dev python3-setuptools python3-packaging python3-numexpr # python3-numpy cython3 + sudo apt install python3-pytest python3-pytest-doctestplus + sudo apt install python3-numpydoc python3-sphinx python3-sphinx-rtd-theme python3-ipython + sudo apt install latexmk texlive-fonts-recommended texlive-latex-recommended texlive-latex-extra texlive-plain-generic + - name: Install dependencies + run: | + python -m pip install --upgrade setuptools pip wheel build + python -m pip install --upgrade cython>=0.29.21 numpy>=1.19 + python -c "import numpy as np; print('numpy:', np.__version__)" + - name: Build PyTables + run: make build + env: + PYTABLES_NO_EMBEDDED_LIBS: TRUE + - name: Build HTML documentation + run: make html + - name: Build LaTeX documentation + run: make latex + - name: Source distribution + run: make dist + - name: Test + run: make check diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..1408c73 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,208 @@ +name: Wheels + +# Publish when a (published) GitHub Release is created. 
+on: + push: + branches: + - master + - 'releases/**' + - 'ci/**' + tags: + - v* + release: + types: + - published + +jobs: + build_wheels: + name: Build wheels on ${{matrix.arch}} for ${{ matrix.os }} + runs-on: ${{ matrix.os }} + env: + HDF5_VERSION: 1.12.1 + strategy: + matrix: + os: [ 'ubuntu-latest', 'macos-latest' ] + arch: [x86_64, aarch64] + exclude: + - os: 'macos-latest' + arch: 'aarch64' + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + submodules: true + + - uses: actions/setup-python@v2 + name: Install Python + with: + python-version: '3.9' + + - uses: docker/setup-qemu-action@v1 + if: runner.os == 'Linux' + name: Set up QEMU + + - name: Install prerequisites for macOS + if: runner.os == 'macOS' + env: + # Best compatibility, even with older releases of macOS. + MACOSX_DEPLOYMENT_TARGET: "10.9" + run: | + brew reinstall --build-from-source --no-binaries --force bzip2 lz4 lzo snappy zstd zlib + brew link --overwrite --force bzip2 zlib + + - name: Install cibuildwheel + run: | + python -m pip install --upgrade cibuildwheel + + - name: Build wheels for Linux or macOS (64-bit) + run: | + python -m cibuildwheel --output-dir wheelhouse + env: + CIBW_ARCHS_LINUX: ${{ matrix.arch }} + CIBW_ARCHS_MACOS: ${{ matrix.arch == 'aarch64' && 'arm64' || 'x86_64'}} + CIBW_BUILD: "cp36-* cp37-* cp38-* cp39-* cp310-*" + CIBW_BEFORE_ALL_LINUX: "yum -y update && yum install -y zlib-devel bzip2-devel lzo-devel && ./ci/github/get_hdf5_if_needed.sh" + CIBW_BEFORE_ALL_MACOS: "./ci/github/get_hdf5_if_needed.sh" + CIBW_BEFORE_BUILD: "pip install -r requirements.txt cython>=0.29.21" + CIBW_ENVIRONMENT: "DISABLE_AVX2='TRUE' HDF5_DIR=/tmp/hdf5 CFLAGS=-g0 HDF5_VERSION=${{ env.HDF5_VERSION }} LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/tmp/hdf5/lib/" + MACOSX_DEPLOYMENT_TARGET: "10.9" + CIBW_ENVIRONMENT_MACOS: HDF5_DIR=/tmp/hdf5 HDF5_VERSION=${{ env.HDF5_VERSION }} CFLAGS=-g0 LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/tmp/hdf5/lib/ BZIP2_DIR=/usr/local/opt/bzip2 LDFLAGS+="-L/usr/local/opt/bzip2/lib -L/usr/local/opt/zlib/lib" CPPFLAGS+="-I/usr/local/opt/bzip2/include -I/usr/local/opt/zlib/include" PKG_CONFIG_PATH="/usr/local/opt/zlib/lib/pkgconfig" + CIBW_SKIP: '*-musllinux_*' + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + + build_wheels_windows: + name: Build wheels on ${{matrix.arch}} for ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [windows-latest] + arch: [win32, win_amd64] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + submodules: true + + - uses: actions/setup-python@v2 + name: Install Python + with: + python-version: '3.9' + + - name: Install Miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + channels: defaults,conda-forge + use-only-tar-bz2: true + + - name: Install cibuildwheel + run: | + python -m pip install --upgrade cibuildwheel + + - name: Build wheels for Windows (${{ matrix.arch }}) + run: cibuildwheel --output-dir wheelhouse + env: + CIBW_BUILD: "cp36-${{ matrix.arch }} cp37-${{ matrix.arch }} cp38-${{ matrix.arch }} cp39-${{ matrix.arch }} cp310-${{ matrix.arch }}" + CIBW_BEFORE_ALL_WINDOWS: "conda create --yes --name=build && conda activate build && conda config --env --set subdir ${{ matrix.arch == 'win32' && 'win-32' || 'win-64' }} && conda install --yes blosc bzip2 hdf5 lz4 lzo snappy zstd zlib" + CIBW_ENVIRONMENT_WINDOWS: 'CONDA_PREFIX="C:\\Miniconda\\envs\\build" PATH="$PATH;C:\\Miniconda\\envs\\build\\Library\\bin"' + CIBW_ENVIRONMENT: "PYTABLES_NO_EMBEDDED_LIBS=true DISABLE_AVX2=true" + 
CIBW_BEFORE_BUILD: "pip install -r requirements.txt cython>=0.29.21 delvewheel" + CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: "delvewheel repair -w {dest_dir} {wheel}" + + - uses: actions/upload-artifact@v2 + with: + path: ./wheelhouse/*.whl + + test_wheels: + needs: [ build_wheels, build_wheels_windows ] + name: Test ${{ matrix.python-version }} ${{ matrix.arch }} wheels for ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ 'ubuntu-latest', 'windows-latest', 'macos-latest' ] + python-version: ['3.7', '3.8', '3.9','3.10'] + arch: ['x64', 'x86'] + exclude: + - os: 'ubuntu-latest' + arch: 'x86' + - os: 'macos-latest' + arch: 'x86' + + steps: + - uses: actions/download-artifact@v2 + with: + path: ./wheelhouse/ + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + architecture: ${{ matrix.arch }} + + - name: Install tables on ${{ matrix.python-version }} + run: | + pip install numpy>=1.19.0 numexpr>=2.6.2 + pip install --no-index --find-links wheelhouse/artifact/ tables + + - name: Run tests on ${{ matrix.python-version }} + run: | + python -m tables.tests.test_all + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + submodules: true + + - uses: actions/setup-python@v2 + name: Install Python + with: + python-version: '3.7' + + - name: Install APT packages + if: contains(${{ matrix.os }}, 'ubuntu') + run: | + sudo apt install libbz2-dev libhdf5-serial-dev liblzo2-dev + sudo apt install latexmk texlive-fonts-recommended texlive-latex-recommended texlive-latex-extra texlive-plain-generic + + - name: Install dependencies + run: | + python -m pip install --upgrade setuptools pip wheel build + python -m pip install -r requirements.txt + python -m pip install cython + python -m pip install sphinx>=1.1 sphinx_rtd_theme numpydoc ipython + + - name: Build sdist + run: make PYTHON=python dist + + - uses: actions/upload-artifact@v2 + with: + path: dist/* + +# upload_pypi: +# needs: [build_wheels, build_sdist] +# runs-on: ubuntu-latest +# # upload to PyPI on every tag starting with 'v' +# if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') +# # alternatively, to publish when a GitHub Release is created, use the following rule: +# # if: github.event_name == 'release' && github.event.action == 'published' +# steps: +# - uses: actions/download-artifact@v2 +# with: +# name: artifact +# path: dist +# +# - uses: pypa/gh-action-pypi-publish@master +# with: +# user: __token__ +# password: ${{ secrets.pypi_password }} +# # To test: repository_url: https://test.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7f8c6b0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,89 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Sphinx documentation +docs/_build/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mypy +.mypy_cache/ + +# PyCharm +.idea + +# PyTables +tables/*.c +src/version.h +a.out +tmp/ + +# misc +*~ + diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..04c71aa --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "c-blosc"] + path = c-blosc + url = https://github.com/Blosc/c-blosc.git diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..3f256ce --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,7 @@ +conda: + file: environment.yml +python: + version: 3 + setup_py_install: true + extra_requirements: + - docs diff --git a/ANNOUNCE.txt.in b/ANNOUNCE.txt.in new file mode 100644 index 0000000..15d939f --- /dev/null +++ b/ANNOUNCE.txt.in @@ -0,0 +1,76 @@ +=========================== + Announcing PyTables @VERSION@ +=========================== + +We are happy to announce PyTables @VERSION@. + + +What's new +========== + +This is a minor version of PyTables. The main feature added is that +compatibility with Python 3.10, numpy 1.21 and HDF5 1.12 has been improved, +while support for Python 3.5 has been dropped. + +The CI infrastructure has been moved to GitHub Actions. + +In case you want to know more in detail what has changed in this +version, please refer to: http://www.pytables.org/release_notes.html + +You can install it via pip or download a source package with generated +PDF and HTML docs from: +https://github.com/PyTables/PyTables/releases/v@VERSION@ + +For an online version of the manual, visit: +http://www.pytables.org/usersguide/index.html + + +What it is? +=========== + +PyTables is a library for managing hierarchical datasets and +designed to efficiently cope with extremely large amounts of data with +support for full 64-bit file addressing. PyTables runs on top of +the HDF5 library and NumPy package for achieving maximum throughput and +convenient use. PyTables includes OPSI, a new indexing technology, +allowing to perform data lookups in tables exceeding 10 gigarows +(10**10 rows) in less than a tenth of a second. + + +Resources +========= + +About PyTables: http://www.pytables.org + +About the HDF5 library: http://hdfgroup.org/HDF5/ + +About NumPy: http://numpy.scipy.org/ + + +Acknowledgments +=============== + +Thanks to many users who provided feature improvements, patches, bug +reports, support and suggestions. See the ``THANKS`` file in the +distribution package for a (incomplete) list of contributors. Most +specially, a lot of kudos go to the HDF5 and NumPy makers. +Without them, PyTables simply would not exist. + + +Share your experience +===================== + +Let us know of any bugs, suggestions, gripes, kudos, etc. you may have. + + +---- + + **Enjoy data!** + + -- The PyTables Developers + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. 
End: diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 0000000..feb85cf --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,6 @@ +@Misc{, + author = {PyTables Developers Team}, + title = {{PyTables}: Hierarchical Datasets in {Python}}, + year = {2002--}, + url = "https://www.pytables.org/" +} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..3bdfc88 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,15 @@ +# Contribute to PyTables + +PyTables is actively seeking additional maintainers and developers. If you are interested in becoming on, check out our developer resources on the PyTables website: http://www.pytables.org/development.html#pytables-development + +## Documentation + +In general, the barrier to entry is lower when making a docs contribution. We encourage you to check out our open docs issues: https://github.com/PyTables/PyTables/labels/documentation + +## Stay up to date + +If you want to stay up to date with PyTables development and communicate with the development team, consider joining our developer mailing list: https://groups.google.com/g/pytables-dev + +## Something else? + +Do you have an idea for a contribution but you are not sure if it fits into any of the aforementioned options? Please open an issue and propose it! We are happy to help support contributions of all types. diff --git a/FUNDING.yml b/FUNDING.yml new file mode 100644 index 0000000..6ff3724 --- /dev/null +++ b/FUNDING.yml @@ -0,0 +1,2 @@ +github: [numfocus] +custom: ['https://numfocus.org/donate-to-pytables'] diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..601fd76 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,35 @@ +Copyright Notice and Statement for PyTables Software Library and Utilities: + +Copyright (c) 2002-2004 by Francesc Alted +Copyright (c) 2005-2007 by Carabos Coop. V. +Copyright (c) 2008-2010 by Francesc Alted +Copyright (c) 2011-2021 by PyTables maintainers +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/LICENSES/BLOSC.txt b/LICENSES/BLOSC.txt new file mode 100644 index 0000000..1226f91 --- /dev/null +++ b/LICENSES/BLOSC.txt @@ -0,0 +1,23 @@ +Blosc - A blocking, shuffling and lossless compression library + +Copyright (C) 2009-2012 Francesc Alted +Copyright (C) 2013 Francesc Alted + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/LICENSES/CLOUD-SPTHEME.txt b/LICENSES/CLOUD-SPTHEME.txt new file mode 100644 index 0000000..3144724 --- /dev/null +++ b/LICENSES/CLOUD-SPTHEME.txt @@ -0,0 +1,47 @@ +.. -*- restructuredtext -*- + +=================== +Copyright & License +=================== + +Cloud Sphinx Theme +================== +cloud_sptheme is released under the BSD license, +and is (c) `Assurance Technologies `_:: + + The "cloud_sptheme" python package and artwork is + Copyright (c) 2010-2012 by Assurance Technologies, LLC. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Assurance Technologies, nor the names of the + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Other Content +============= +Most of the icons in ``cloud_sptheme:themes/cloud/static`` +are from the `Tango Icon Project `_, +which has released them into the Public Domain. 
diff --git a/LICENSES/FASTLZ.txt b/LICENSES/FASTLZ.txt new file mode 100644 index 0000000..4a6abd6 --- /dev/null +++ b/LICENSES/FASTLZ.txt @@ -0,0 +1,24 @@ +FastLZ - lightning-fast lossless compression library + +Copyright (C) 2007 Ariya Hidayat (ariya@kde.org) +Copyright (C) 2006 Ariya Hidayat (ariya@kde.org) +Copyright (C) 2005 Ariya Hidayat (ariya@kde.org) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/LICENSES/H5PY.txt b/LICENSES/H5PY.txt new file mode 100644 index 0000000..081ef02 --- /dev/null +++ b/LICENSES/H5PY.txt @@ -0,0 +1,34 @@ +Copyright Notice and Statement for the h5py Project + +Copyright (c) 2008 Andrew Collette +http://www.h5py.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +a. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +c. Neither the name of the author nor the names of contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/LICENSES/HDF5.txt b/LICENSES/HDF5.txt new file mode 100644 index 0000000..54126de --- /dev/null +++ b/LICENSES/HDF5.txt @@ -0,0 +1,69 @@ +HDF5 (Hierarchical Data Format 5) Software Library and Utilities +Copyright 2006-2007 by The HDF Group (THG). + +NCSA HDF5 (Hierarchical Data Format 5) Software Library and Utilities +Copyright 1998-2006 by the Board of Trustees of the University of Illinois. 
+ +All rights reserved. + +Contributors: National Center for Supercomputing Applications (NCSA) +at the University of Illinois, Fortner Software, Unidata Program +Center (netCDF), The Independent JPEG Group (JPEG), Jean-loup Gailly +and Mark Adler (gzip), and Digital Equipment Corporation (DEC). + +Redistribution and use in source and binary forms, with or without +modification, are permitted for any purpose (including commercial +purposes) provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright +notice, this list of conditions, and the following disclaimer. + 2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions, and the following +disclaimer in the documentation and/or materials provided with the +distribution. + 3. In addition, redistributions of modified forms of the source or +binary code must carry prominent notices stating that the original +code was changed and the date of the change. + 4. All publications or advertising materials mentioning features or +use of this software are asked, but not required, to acknowledge that +it was developed by The HDF Group and by the National Center for +Supercomputing Applications at the University of Illinois at +Urbana-Champaign and credit the contributors. + 5. Neither the name of The HDF Group, the name of the University, +nor the name of any Contributor may be used to endorse or promote +products derived from this software without specific prior written +permission from THG, the University, or the Contributor, respectively. + +DISCLAIMER: THIS SOFTWARE IS PROVIDED BY THE HDF GROUP (THG) AND THE +CONTRIBUTORS "AS IS" WITH NO WARRANTY OF ANY KIND, EITHER EXPRESSED OR +IMPLIED. In no event shall THG or the Contributors be liable for any +damages suffered by the users arising out of the use of this software, +even if advised of the possibility of such damage. + +Portions of HDF5 were developed with support from the University of +California, Lawrence Livermore National Laboratory (UC LLNL). The +following statement applies to those portions of the product and must +be retained in any redistribution of source code, binaries, +documentation, and/or accompanying materials: + +This work was partially produced at the University of California, +Lawrence Livermore National Laboratory (UC LLNL) under contract +no. W-7405-ENG-48 (Contract 48) between the U.S. Department of Energy +(DOE) and The Regents of the University of California (University) for +the operation of UC LLNL. + +DISCLAIMER: This work was prepared as an account of work sponsored by +an agency of the United States Government. Neither the United States +Government nor the University of California nor any of their +employees, makes any warranty, express or implied, or assumes any +liability or responsibility for the accuracy, completeness, or +usefulness of any information, apparatus, product, or process +disclosed, or represents that its use would not infringe privately- +owned rights. Reference herein to any specific commercial products, +process, or service by trade name, trademark, manufacturer, or +otherwise, does not necessarily constitute or imply its endorsement, +recommendation, or favoring by the United States Government or the +University of California. 
The views and opinions of authors expressed +herein do not necessarily state or reflect those of the United States +Government or the University of California, and shall not be used for +advertising or product endorsement purposes. diff --git a/LICENSES/LZ4.txt b/LICENSES/LZ4.txt new file mode 100644 index 0000000..2383e10 --- /dev/null +++ b/LICENSES/LZ4.txt @@ -0,0 +1,32 @@ +LZ4 - Fast LZ compression algorithm + +Copyright (C) 2011-2014, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html +- LZ4 source repository : http://code.google.com/p/lz4/ + diff --git a/LICENSES/SNAPPY.txt b/LICENSES/SNAPPY.txt new file mode 100644 index 0000000..8d6bd9f --- /dev/null +++ b/LICENSES/SNAPPY.txt @@ -0,0 +1,28 @@ +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/STDINT.txt b/LICENSES/STDINT.txt new file mode 100644 index 0000000..486e694 --- /dev/null +++ b/LICENSES/STDINT.txt @@ -0,0 +1,26 @@ +Copyright (c) 2006-2013 Alexander Chemeris + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of the product nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/WIN32PTHREADS.txt b/LICENSES/WIN32PTHREADS.txt new file mode 100644 index 0000000..bd5ced5 --- /dev/null +++ b/LICENSES/WIN32PTHREADS.txt @@ -0,0 +1,19 @@ +Copyright (C) 2009 Andrzej K. Haczewski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/LICENSES/ZLIB.txt b/LICENSES/ZLIB.txt new file mode 100644 index 0000000..5d74f5c --- /dev/null +++ b/LICENSES/ZLIB.txt @@ -0,0 +1,22 @@ +Copyright notice: + + (C) 1995-2013 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu diff --git a/LICENSES/ZSTD.TXT b/LICENSES/ZSTD.TXT new file mode 100644 index 0000000..a793a80 --- /dev/null +++ b/LICENSES/ZSTD.TXT @@ -0,0 +1,30 @@ +BSD License + +For Zstandard software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5b6b978 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,28 @@ +include MANIFEST.in +include *.txt THANKS README.rst +include setup.py setup.cfg VERSION Makefile cpuinfo.py + +recursive-include tables *.py *.pyx *.pxd *.c +recursive-include tables/tests *.h5 *.mat +recursive-include tables/nodes/tests *.h5 *.dat *.xbm +recursive-include src *.c *.h Makefile + +include hdf5-blosc/src/blosc_filter.? 
+recursive-include c-blosc/blosc *.c *.h *.inc +recursive-include c-blosc/internal-complibs *.c *.cc *.h + +recursive-include LICENSES * +recursive-include utils * +include doc/Makefile doc/make.bat +#include doc/*.pdf +recursive-include doc *.rst *.conf *.py *.*_t +recursive-include doc *.html *.js *.css *.png *.ico +recursive-include doc/source *.pdf objecttree.svg +#recursive-include doc/source *.pdf *.svg +recursive-include doc/html *.txt *.svg *.gif *.inv +recursive-include doc/scripts *.py +recursive-include doc/sphinxext * +recursive-exclude doc/build * +recursive-include examples *.py *.sh +recursive-include bench *.sh *.py *.txt *.h5 *.gnuplot +recursive-include contrib README *.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7d0bb2a --- /dev/null +++ b/Makefile @@ -0,0 +1,69 @@ +# This Makefile is only intended to prepare for distribution the PyTables +# sources exported from a repository. For building and installing PyTables, +# please use ``setup.py`` as described in the ``README.rst`` file. + +VERSION = $(shell grep "__version__ =" tables/__init__.py | cut -f 3 -d ' ' | sed s/\"//g) +SRCDIRS = src doc +GENERATED = ANNOUNCE.txt +PYTHON = python3 +PYPLATFORM = $(shell $(PYTHON) -c "from distutils.util import get_platform; print(get_platform())") +PYVER = $(shell $(PYTHON) -V 2>&1 | cut -c 8-10) +PYBUILDDIR = $(PWD)/build/lib.$(PYPLATFORM)-$(PYVER) +OPT = PYTHONPATH=$(PYBUILDDIR) + + +.PHONY: all dist build check heavycheck clean distclean html + +all: $(GENERATED) build html + +dist: all latex + # $(PYTHON) -m build --sdist # --no-isolation + $(PYTHON) setup.py sdist + cp RELEASE_NOTES.rst dist/RELEASE_NOTES-$(VERSION).rst + cp doc/usersguide-$(VERSION).pdf dist/pytablesmanual-$(VERSION).pdf + tar cvzf dist/pytablesmanual-$(VERSION)-html.tar.gz doc/html + cd dist && \ + md5sum -b tables-$(VERSION).tar.gz RELEASE_NOTES-$(VERSION).rst \ + pytablesmanual-$(VERSION).pdf \ + pytablesmanual-$(VERSION)-html.tar.gz > pytables-$(VERSION).md5 && \ + cd - + +clean: + $(RM) -r MANIFEST build dist tmp tables/__pycache__ + $(RM) bench/*.h5 bench/*.prof + $(RM) -r examples/*.h5 examples/raw + $(RM) -r *.egg-info + $(RM) $(GENERATED) tables/*.so a.out + find . '(' -name '*.py[co]' -o -name '*~' ')' -exec rm '{}' ';' + for srcdir in $(SRCDIRS) ; do $(MAKE) -C $$srcdir $(OPT) $@ ; done + +distclean: clean + $(MAKE) -C src $(OPT) $@ + $(RM) tables/_comp_*.c tables/*extension.c + $(RM) doc/usersguide-*.pdf + $(RM) -r doc/html + $(RM) -r .pytest_cache + # git clean -fdx + +html: build + $(MAKE) -C doc $(OPT) html + $(RM) -r doc/html + cp -R doc/build/html doc/html + +latex: + $(MAKE) -C doc $(OPT) latexpdf + $(RM) doc/usersguide-*.pdf + cp doc/build/latex/usersguide-$(VERSION).pdf doc + +%: %.in tables/__init__.py + cat "$<" | sed -e 's/@VERSION@/$(VERSION)/g' > "$@" + +build: + $(PYTHON) setup.py build + +check: build + cd build/lib.*-$(PYVER) && env PYTHONPATH=. $(PYTHON) -m pytest --doctest-only --pyargs tables -k "not AttributeSet" + cd build/lib.*-$(PYVER) && env PYTHONPATH=. $(PYTHON) tables/tests/test_all.py + +heavycheck: build + cd build/lib.*-$(PYVER) && env PYTHONPATH=. $(PYTHON) tables/tests/test_all.py --heavy diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..523140d --- /dev/null +++ b/README.rst @@ -0,0 +1,154 @@ +=========================================== + PyTables: hierarchical datasets in Python +=========================================== + +.. 
image:: https://badges.gitter.im/Join%20Chat.svg + :alt: Join the chat at https://gitter.im/PyTables/PyTables + :target: https://gitter.im/PyTables/PyTables + +.. image:: https://github.com/PyTables/PyTables/workflows/CI/badge.svg + :target: https://github.com/PyTables/PyTables/actions?query=workflow%3ACI + +.. image:: https://img.shields.io/pypi/v/tables.svg + :target: https://pypi.org/project/tables/ + +.. image:: https://img.shields.io/pypi/pyversions/tables.svg + :target: https://pypi.org/project/tables/ + +.. image:: https://img.shields.io/pypi/l/tables + :target: https://github.com/PyTables/PyTables/ + + +:URL: http://www.pytables.org/ + + +PyTables is a package for managing hierarchical datasets and designed +to efficiently cope with extremely large amounts of data. + +It is built on top of the HDF5 library and the NumPy package. It +features an object-oriented interface that, combined with C extensions +for the performance-critical parts of the code (generated using +Cython), makes it a fast, yet extremely easy to use tool for +interactively save and retrieve very large amounts of data. One +important feature of PyTables is that it optimizes memory and disk +resources so that they take much less space (between a factor 3 to 5, +and more if the data is compressible) than other solutions, like for +example, relational or object oriented databases. + +State-of-the-art compression +---------------------------- + +PyTables comes with out-of-box support for the `Blosc compressor +`_. This allows for extremely high compression +speed, while keeping decent compression ratios. By doing so, I/O can +be accelerated by a large extent, and you may end achieving higher +performance than the bandwidth provided by your I/O subsystem. See +the `Tuning The Chunksize section of the Optimization Tips chapter +`_ +of user documentation for some benchmarks. + +Not a RDBMS replacement +----------------------- + +PyTables is not designed to work as a relational database replacement, +but rather as a teammate. If you want to work with large datasets of +multidimensional data (for example, for multidimensional analysis), or +just provide a categorized structure for some portions of your +cluttered RDBS, then give PyTables a try. It works well for storing +data from data acquisition systems (DAS), simulation software, network +data monitoring systems (for example, traffic measurements of IP +packets on routers), or as a centralized repository for system logs, +to name only a few possible uses. + +Tables +------ + +A table is defined as a collection of records whose values are stored +in fixed-length fields. All records have the same structure and all +values in each field have the same data type. The terms "fixed-length" +and strict "data types" seems to be quite a strange requirement for an +interpreted language like Python, but they serve a useful function if +the goal is to save very large quantities of data (such as is +generated by many scientific applications, for example) in an +efficient manner that reduces demand on CPU time and I/O. + +Arrays +------ + +There are other useful objects like arrays, enlargeable arrays or +variable length arrays that can cope with different missions on your +project. + +Easy to use +----------- + +One of the principal objectives of PyTables is to be user-friendly. +In addition, many different iterators have been implemented so as to +enable the interactive work to be as productive as possible. 
+ +Platforms +--------- + +We are using Linux on top of Intel32 and Intel64 boxes as the main +development platforms, but PyTables should be easy to compile/install +on other UNIX or Windows machines. + +Compiling +--------- + +To compile PyTables you will need, at least, a recent version of HDF5 +(C flavor) library, the Zlib compression library and the NumPy and +Numexpr packages. Besides, it comes with support for the Blosc, LZO +and bzip2 compressor libraries. Blosc is mandatory, but PyTables comes +with Blosc sources so, although it is recommended to have Blosc +installed in your system, you don't absolutely need to install it +separately. LZO and bzip2 compression libraries are, however, +optional. + +Installation +------------ + +1. Make sure you have HDF5 version 1.8.4 or above. + + On OSX you can install HDF5 using `Homebrew `_:: + + $ brew install hdf5 + + On debian bases distributions:: + + $ sudo apt-get install libhdf5-serial-dev + + If you have the HDF5 library in some non-standard location (that + is, where the compiler and the linker can't find it) you can use + the environment variable `HDF5_DIR` to specify its location. See + `the manual + `_ for more + details. + +3. For stability (and performance too) reasons, it is strongly + recommended that you install the C-Blosc library separately, + although you might want PyTables to use its internal C-Blosc + sources. + +3. Optionally, consider to install the LZO compression library and/or + the bzip2 compression library. + +4. Install!:: + + $ python3 -m pip install tables + +5. To run the test suite run:: + + $ python3 -m tables.tests.test_all + + If there is some test that does not pass, please send the + complete output for tests back to us. + + +**Enjoy data!** -- The PyTables Team + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 70 +.. End: diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst new file mode 100644 index 0000000..234d3db --- /dev/null +++ b/RELEASE_NOTES.rst @@ -0,0 +1,52 @@ +======================================= + Release notes for PyTables 3.7 series +======================================= + +:Author: PyTables Developers +:Contact: pytables-dev@googlegroups.com + +.. py:currentmodule:: tables + + +Changes from 3.6.1 to 3.7.0 +=========================== + +Improvements +------------ +- Compatibility with Python 3.10, numpy 1.21 and HDF5 1.12. +- Support for Python 3.5 has been dropped (:issue:`840` and :issue:`850`). +- Windows: Significantly faster `import tables` PR #781. + Thanks to Christoph Gohlke. +- Internal C-Blosc sources updated to 1.21.1 (:issue:`931`). + Note that, starting from C-Blosc 1.19 does not include the Snappy codec + sources anymore, so Snappy will be not available if you compile from + included sources; other packages (like conda or wheels), + may (or may not) include it. +- Stop using appveyor and deprecated ci-helpers (closes :issue:`827`). +- Switch to `git submodule` for the management of vendored c-blosc sources. +- CI moved to GitHub Actions (GHA). +- Drop Travis-CI. +- Improved code formatting and notation consistency (:issue:`873`, + :issue:`868`, :issue:`865` thanks to Miroslav Šedivý). +- Improve the use of modern Python including :mod:`pathlib`, f-strings + (:issue:`859`, :issue:`855`, :issue:`839` and :issue:`818` + thanks to Miroslav Šedivý). +- Several improvements to wheels generation in CI + (thanks to Andreas Motl @amotl and Matthias @xmatthias). +- Simplified management of version information. +- Drop dependency on the deprecated distutils. 
+- Modernize the setup script and add support for PEP517 (:issue:`907`). + +Bugfixes +-------- +- Fix `pkg-config` (`setup.py`) for Python 3.9 on Debian. + Thanks to Marco Sulla PR #792. +- Fix ROFileNode fails to return the `fileno()` (:issue:`633`). +- Do not flush read only files (:issue:`915` thanks to @lrepiton). + +Other changes +------------- +- Drop the deprecated `hdf5Version` and `File.open_count`. +- the :func:`get_tables_version` and :func:`get_hdf5_version` functions are + now deprecated please use the coresponding :data:`tables.__version__` and + :data:`tables.hdf5_version` instead. diff --git a/THANKS b/THANKS new file mode 100644 index 0000000..0ba6df4 --- /dev/null +++ b/THANKS @@ -0,0 +1,86 @@ +March 2009 + +We would like to thank the people have contributed directly or +indirectly to PyTables. + +Scott Prater for editing the user's manual in order to make it more +readable in english, as well as conducting the tests of PyTables on +Solaris. + +Alan McIntyre for porting PyTables to Windows. + +John Nielsen for suggesting improvements and delivering code for +completely avoid the recursion algorithms and allowing pytables to +bypass the ~1000 levels of deepness that Python recursion limit +imposed. + +Tom Hedley for providing a nice patch for supporting complex datatypes +for Arrays, Errays and VLArrays. This was the root for the support of +complex types in Tables as well. + +Shack Toms for providing a Python version of the nextafter and +nextafterf math functions that despite the fact they are standard in +C99 standard, they are not at the official places in Microsoft VC++ +6.x nor VC++ 7.x. + +Jeff Whitaker for providing the NetCDF module and the utility for +converting netCDF files to HDF5 (nctoh5). + +Norbert Nemec for providing several interesting patches. + +Andrew Straw for suggesting to bracket the most intensive parts of +PyTables with BEGIN_ALLOW_THREADS and END_ALLOW_THREADS. That will +allow much better performance of PyTables apps in mutiprocessors +platforms. + +Antonio Valentino for providing several patches for supporting +native multidimensional attributes and the CArray object. + +Ashley Walsh, for reporting several problems and fixes. It has helped +testing OSX platform, specially UCL compressor issues. + +Russel Howe, for reporting and providing an initial patch for a nasty +memory leak when reading VLArray types. + +The HDF5 team at NCSA for making such an excellent library for data +persistence, and specially Pedro Vicente, Quincey Koziol and Elena +Pourmal, for quickly including my suggested patches to the HDF5_HL and +solving the reported bugs in HDF5 library. + +Todd Miller and Perry Greenfield for promptly helping me to understand +many of the intricacies of the numarray package and Jin-chung Hsu for +discussions on recarray module (now numarray.records module). They +have been very receptive and promptly worked-out most of the +improvements in numarray (specially in the records module) that were +necessary for PyTables. + +Travis Oliphant for its impressive work and responsiveness with NumPy. + +Evan Prodromou for his lrucache package, a very sleek implementation +of an LRU queue. He had a very helpful attitude with the licensing +and technical issues. + +Gerard Vermeulen for Windows/MSVS-2005 testing. + +Enric Cervera for testing the binaries for MacOSX/Intel. + +Daniel Bungert, Steve Langasek and Alexandre Fayolle for their support +in creating Debian packages for PyTables. 
+ +Greg Ewing for writing the excelent Pyrex tool and allowing to beginners +like me to quickly and safely start writing Python extensions. He was +also very responsive about questions on Pyrex. + +Stefan Behnel, Robert Bradshaw, and Dag Sverre Seljebotn for their +impressive work with Cython. + +Andrew Collette, for his excellent work on the h5py project, from +which PyTables starts to stole ideas (and code too ;-). + +Guido, you know who. + +And last, but definitely not least!, + +To those companies that are supporting the PyTables project with +contracts. + diff --git a/bench/LRU-experiments.py b/bench/LRU-experiments.py new file mode 100644 index 0000000..972c2b0 --- /dev/null +++ b/bench/LRU-experiments.py @@ -0,0 +1,96 @@ +# Testbed to perform experiments in order to determine best values for +# the node numbers in LRU cache. Tables version. + +from time import perf_counter as clock +import tables as tb + +print("PyTables version-->", tb.__version__) + +filename = "/tmp/junk-tables-100.h5" +NLEAVES = 2000 +NROWS = 1000 + + +class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.Int32Col(pos=2) # integer + longi = tb.Int32Col(pos=3) # integer + pressure = tb.Float32Col(pos=4) # float (single-precision) + temperature = tb.Float64Col(pos=5) # double (double-precision) + + +def create_junk(): + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w") + # Create a new group + group = fileh.create_group(fileh.root, "newgroup") + + for i in range(NLEAVES): + # Create a new table in newgroup group + table = fileh.create_table(group, 'table' + str(i), Particle, + "A table", tb.Filters(1)) + particle = table.row + print("Creating table-->", table._v_name) + + # Fill the table with particles + for i in range(NROWS): + # This injects the row values. + particle.append() + table.flush() + + # Finally, close the file + fileh.close() + + +def modify_junk_LRU(): + fileh = tb.open_file(filename, 'a') + group = fileh.root.newgroup + for j in range(5): + print("iter -->", j) + for tt in fileh.walk_nodes(group): + if isinstance(tt, tb.Table): + pass +# for row in tt: +# pass + fileh.close() + + +def modify_junk_LRU2(): + fileh = tb.open_file(filename, 'a') + group = fileh.root.newgroup + for j in range(20): + t1 = clock() + for i in range(100): + #print("table-->", tt._v_name) + tt = getattr(group, "table" + str(i)) + #for row in tt: + # pass + print(f"iter and time --> {j + 1} {clock() - t1:.3f}") + fileh.close() + + +def modify_junk_LRU3(): + fileh = tb.open_file(filename, 'a') + group = fileh.root.newgroup + for j in range(3): + t1 = clock() + for tt in fileh.walk_nodes(group, "Table"): + tt.attrs.TITLE + for row in tt: + pass + print(f"iter and time --> {j + 1} {clock() - t1:.3f}") + fileh.close() + +if 1: + # create_junk() + # modify_junk_LRU() # uses the iterator version (walk_nodes) + # modify_junk_LRU2() # uses a regular loop (getattr) + modify_junk_LRU3() # uses a regular loop (getattr) +else: + import profile + import pstats + profile.run('modify_junk_LRU2()', 'modify.prof') + stats = pstats.Stats('modify.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats() diff --git a/bench/LRU-experiments2.py b/bench/LRU-experiments2.py new file mode 100644 index 0000000..cfb7a65 --- /dev/null +++ b/bench/LRU-experiments2.py @@ -0,0 +1,56 @@ +# Testbed to perform experiments in order to determine best values for +# the node numbers in LRU cache. Arrays version. 
+ +from time import perf_counter as clock +import tables as tb + +print("PyTables version-->", tb.__version__) + +filename = "/tmp/junk-array.h5" +NOBJS = 1000 + + +def create_junk(): + fileh = tb.open_file(filename, mode="w") + for i in range(NOBJS): + fileh.create_array(fileh.root, 'array' + str(i), [1]) + fileh.close() + + +def modify_junk_LRU(): + fileh = tb.open_file(filename, 'a') + group = fileh.root + for j in range(5): + print("iter -->", j) + for tt in fileh.walk_nodes(group): + if isinstance(tt, tb.Array): +# d = tt.read() + pass + + fileh.close() + + +def modify_junk_LRU2(): + fileh = tb.open_file(filename, 'a') + group = fileh.root + for j in range(5): + t1 = clock() + for i in range(100): # The number + #print("table-->", tt._v_name) + tt = getattr(group, "array" + str(i)) + #d = tt.read() + print(f"iter and time --> {j + 1} {clock() - t1:.3f}") + fileh.close() + +if 1: + # create_junk() + # modify_junk_LRU() # uses the iterador version (walk_nodes) + modify_junk_LRU2() # uses a regular loop (getattr) +else: + import profile + import pstats + profile.run('modify_junk_LRU2()', 'modify.prof') + stats = pstats.Stats('modify.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats() diff --git a/bench/LRUcache-node-bench.py b/bench/LRUcache-node-bench.py new file mode 100644 index 0000000..1573d5f --- /dev/null +++ b/bench/LRUcache-node-bench.py @@ -0,0 +1,77 @@ +import sys +import numpy as np +import tables as tb +from time import perf_counter as clock +#import psyco + +filename = "/tmp/LRU-bench.h5" +nodespergroup = 250 +niter = 100 + +print('nodespergroup:', nodespergroup) +print('niter:', niter) + +if len(sys.argv) > 1: + NODE_CACHE_SLOTS = int(sys.argv[1]) + print('NODE_CACHE_SLOTS:', NODE_CACHE_SLOTS) +else: + NODE_CACHE_SLOTS = tb.parameters.NODE_CACHE_SLOTS +f = tb.open_file(filename, "w", node_cache_slots=NODE_CACHE_SLOTS) +g = f.create_group("/", "NodeContainer") +print("Creating nodes") +for i in range(nodespergroup): + f.create_array(g, "arr%d" % i, [i]) +f.close() + +f = tb.open_file(filename) + + +def iternodes(): +# for a in f.root.NodeContainer: +# pass + indices = np.random.randn(nodespergroup * niter) * 30 + nodespergroup / 2 + indices = indices.astype('i4').clip(0, nodespergroup - 1) + g = f.get_node("/", "NodeContainer") + for i in indices: + a = f.get_node(g, "arr%d" % i) + # print("a-->", a) + +print("reading nodes...") +# First iteration (put in LRU cache) +t1 = clock() +for a in f.root.NodeContainer: + pass +print(f"time (init cache)--> {clock() - t1:.3f}") + + +def timeLRU(): + # Next iterations + t1 = clock() +# for i in range(niter): +# iternodes() + iternodes() + print(f"time (from cache)--> {(clock() - t1) / niter:.3f}") + + +def profile(verbose=False): + import pstats + import cProfile as prof + prof.run('timeLRU()', 'out.prof') + stats = pstats.Stats('out.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + +# profile() +# psyco.bind(timeLRU) +timeLRU() + +f.close() + +# for N in 0 4 8 16 32 64 128 256 512 1024 2048 4096; do +# env PYTHONPATH=../build/lib.linux-x86_64-2.7 \ +# python LRUcache-node-bench.py $N; +# done diff --git a/bench/bench-postgres-ranges.sh b/bench/bench-postgres-ranges.sh new file mode 100755 index 0000000..be72475 --- /dev/null +++ b/bench/bench-postgres-ranges.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +export PYTHONPATH=..${PYTHONPATH:+:$PYTHONPATH} + +pyopt="-O -u" +#qlvl="-Q8 -x" +#qlvl="-Q8" +qlvl="-Q7" +#size="500m" 
+size="1g" + +#python $pyopt indexed_search.py -P -c -n $size -m -v +python $pyopt indexed_search.py -P -i -n $size -m -v -sfloat $qlvl + diff --git a/bench/bench-pytables-ranges.sh b/bench/bench-pytables-ranges.sh new file mode 100755 index 0000000..d73efdf --- /dev/null +++ b/bench/bench-pytables-ranges.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +#export LD_LIBRARY_PATH=$HOME/computacio/hdf5-1.8.2/hdf5/lib +export PYTHONPATH=..${PYTHONPATH:+:$PYTHONPATH} + +bench="python2.7 -O -u indexed_search.py" +flags="-T -m -v " +#sizes="1g 500m 200m 100m 50m 20m 10m 5m 2m 1m" +sizes="1g" +#sizes="1m" +working_dir="data.nobackup" +#working_dir="/scratch2/faltet" + +#for comprlvl in '-z0' '-z1 -llzo' '-z1 -lzlib' ; do +#for comprlvl in '-z6 -lblosc' '-z3 -lblosc' '-z1 -lblosc' ; do +for comprlvl in '-z5 -lblosc' ; do +#for comprlvl in '-z0' ; do + for optlvl in '-tfull -O9' ; do + #for optlvl in '-tultralight -O3' '-tlight -O6' '-tmedium -O6' '-tfull -O9'; do + #for optlvl in '-tultralight -O3'; do + #rm -f $working_dir/* # XXX esta ben posat?? + for mode in '-Q8 -i -s float' ; do + #for mode in -c '-Q7 -i -s float' ; do + #for mode in '-c -s float' '-Q8 -I -s float' '-Q8 -S -s float'; do + for size in $sizes ; do + $bench $flags $mode -n $size $optlvl $comprlvl -d $working_dir + done + done + done +done diff --git a/bench/bench-pytables.sh b/bench/bench-pytables.sh new file mode 100755 index 0000000..caa620b --- /dev/null +++ b/bench/bench-pytables.sh @@ -0,0 +1,28 @@ +#!/bin/sh + +export LD_LIBRARY_PATH=$HOME/computacio/hdf5-1.8.1/hdf5/lib +#export PYTHONPATH=..${PYTHONPATH:+:$PYTHONPATH} + +bench="python2.7 -O -u indexed_search.py" +flags="-T -m -v -d data.nobackup" +#sizes="1m 2m 5m 10m 20m 50m 100m 200m 500m 1g" +sizes="2g 1g 500m 200m 100m 50m 20m 10m 5m 2m 1m 500k 200k 100k 50k 20k 10k 5k 2k 1k" +#sizes="1m 100k" + +#for optimlvl in 0 1 2 3 4 5 6 7 8 9 ; do +for idxtype in ultralight light medium full; do +#for idxtype in medium full; do + for optimlvl in 0 3 6 9; do + for compropt in '' '-z1 -lzlib' '-z1 -llzo' ; do + #for compropt in '-z1 -llzo' ; do + rm -rf data.nobackup/* # Atencio: esta correctament posat? 
+ #for mode in -c '-i -s float' ; do + for mode in -c '-i' ; do + for size in $sizes ; do + $bench $flags $mode -n $size -O $optimlvl -t $idxtype $compropt + done + done + done + done +done +rm -rf data.nobackup diff --git a/bench/blosc.py b/bench/blosc.py new file mode 100644 index 0000000..f5c9ac0 --- /dev/null +++ b/bench/blosc.py @@ -0,0 +1,164 @@ +import sys +from pathlib import Path +from time import perf_counter as clock +import numpy as np +import tables as tb + + +niter = 3 +dirname = "/scratch2/faltet/blosc-data/" +#expression = "a**2 + b**3 + 2*a*b + 3" +#expression = "a+b" +#expression = "a**2 + 2*a/b + 3" +#expression = "(a+b)**2 - (a**2 + b**2 + 2*a*b) + 1.1" +expression = "3*a-2*b+1.1" +shuffle = True + + +def create_file(kind, prec, synth): + prefix_orig = 'cellzome/cellzome-' + iname = dirname + prefix_orig + 'none-' + prec + '.h5' + f = tb.open_file(iname, "r") + + if prec == "single": + type_ = tb.Float32Atom() + else: + type_ = tb.Float64Atom() + + if synth: + prefix = 'synth/synth-' + else: + prefix = 'cellzome/cellzome-' + + for clevel in range(10): + oname = '%s/%s-%s%d-%s.h5' % (dirname, prefix, kind, clevel, prec) + # print "creating...", iname + f2 = tb.open_file(oname, "w") + + if kind in ["none", "numpy"]: + filters = None + else: + filters = tb.Filters( + complib=kind, complevel=clevel, shuffle=shuffle) + + for name in ['maxarea', 'mascotscore']: + col = f.get_node('/', name) + r = f2.create_carray('/', name, type_, col.shape, filters=filters) + if synth: + r[:] = np.arange(col.nrows, dtype=type_.dtype) + else: + r[:] = col[:] + f2.close() + if clevel == 0: + size = 1.5 * Path(oname).stat().st_size + f.close() + return size + + +def create_synth(kind, prec): + + prefix_orig = 'cellzome/cellzome-' + iname = dirname + prefix_orig + 'none-' + prec + '.h5' + f = tb.open_file(iname, "r") + + if prec == "single": + type_ = tb.Float32Atom() + else: + type_ = tb.Float64Atom() + + prefix = 'synth/synth-' + for clevel in range(10): + oname = '%s/%s-%s%d-%s.h5' % (dirname, prefix, kind, clevel, prec) + # print "creating...", iname + f2 = tb.open_file(oname, "w") + + if kind in ["none", "numpy"]: + filters = None + else: + filters = tb.Filters( + complib=kind, complevel=clevel, shuffle=shuffle) + + for name in ['maxarea', 'mascotscore']: + col = f.get_node('/', name) + r = f2.create_carray('/', name, type_, col.shape, filters=filters) + if name == 'maxarea': + r[:] = np.arange(col.nrows, dtype=type_.dtype) + else: + r[:] = np.arange(col.nrows, 0, dtype=type_.dtype) + + f2.close() + if clevel == 0: + size = 1.5 * Path(oname).stat().st_size + f.close() + return size + + +def process_file(kind, prec, clevel, synth): + + if kind == "numpy": + lib = "none" + else: + lib = kind + if synth: + prefix = 'synth/synth-' + else: + prefix = 'cellzome/cellzome-' + iname = '%s/%s-%s%d-%s.h5' % (dirname, prefix, kind, clevel, prec) + f = tb.open_file(iname, "r") + a_ = f.root.maxarea + b_ = f.root.mascotscore + + oname = '%s/%s-%s%d-%s-r.h5' % (dirname, prefix, kind, clevel, prec) + f2 = tb.open_file(oname, "w") + if lib == "none": + filters = None + else: + filters = tb.Filters(complib=lib, complevel=clevel, shuffle=shuffle) + if prec == "single": + type_ = tb.Float32Atom() + else: + type_ = tb.Float64Atom() + r = f2.create_carray('/', 'r', type_, a_.shape, filters=filters) + + if kind == "numpy": + a2, b2 = a_[:], b_[:] + t0 = clock() + r = eval(expression, {'a': a2, 'b': b2}) + print(f"{clock() - t0:5.2f}") + else: + expr = tb.Expr(expression, {'a': a_, 'b': b_}) + expr.set_output(r) + 
expr.eval() + f.close() + f2.close() + size = Path(iname).stat().st_size + Path(oname).stat().st_size + return size + + +if __name__ == '__main__': + if len(sys.argv) > 3: + kind = sys.argv[1] + prec = sys.argv[2] + if sys.argv[3] == "synth": + synth = True + else: + synth = False + else: + print("3 parameters required") + sys.exit(1) + + # print "kind, precision, synth:", kind, prec, synth + + # print "Creating input files..." + size_orig = create_file(kind, prec, synth) + + # print "Processing files for compression levels in range(10)..." + for clevel in range(10): + t0 = clock() + ts = [] + for i in range(niter): + size = process_file(kind, prec, clevel, synth) + ts.append(clock() - t0) + t0 = clock() + ratio = size_orig / size + print(f"{min(ts):5.2f}, {ratio:5.2f}") diff --git a/bench/bsddb-table-bench.py b/bench/bsddb-table-bench.py new file mode 100644 index 0000000..d934f99 --- /dev/null +++ b/bench/bsddb-table-bench.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python +###### WARNING ####### +### This script is obsoleted ### +# If you get it working again, please drop me a line +# F. Alted 2004-01-27 + +import sys +import struct +import cPickle + +import numpy as np +import tables as tb + +from bsddb import db +import psyco + + +# This class is accessible only for the examples +class Small(tb.IsDescription): + """Record descriptor. + + A record has several columns. They are represented here as class + attributes, whose names are the column names and their values will + become their types. The IsColDescr class will take care the user + will not add any new variables and that its type is correct. + + """ + + var1 = tb.StringCol(itemsize=16) + var2 = tb.Int32Col() + var3 = tb.Float64Col() + +# Define a user record to characterize some kind of particles + + +class Medium(tb.IsDescription): + name = tb.StringCol(itemsize=16, pos=0) # 16-character String + #float1 = Float64Col(shape=2, dflt=2.3) + float1 = tb.Float64Col(dflt=1.3, pos=1) + float2 = tb.Float64Col(dflt=2.3, pos=2) + ADCcount = tb.Int16Col(pos=3) # signed short integer + grid_i = tb.Int32Col(pos=4) # integer + grid_j = tb.Int32Col(pos=5) # integer + pressure = tb.Float32Col(pos=6) # float (single-precision) + energy = tb.Float64Col(pos=7) # double (double-precision) + +# Define a user record to characterize some kind of particles + + +class Big(tb.IsDescription): + name = tb.StringCol(itemsize=16) # 16-character String + #float1 = Float64Col(shape=32, dflt=np.arange(32)) + #float2 = Float64Col(shape=32, dflt=np.arange(32)) + float1 = tb.Float64Col(shape=32, dflt=range(32)) + float2 = tb.Float64Col(shape=32, dflt=[2.2] * 32) + ADCcount = tb.Int16Col() # signed short integer + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col() # double (double-precision) + + +def createFile(filename, totalrows, recsize, verbose): + + # Open a 'n'ew file + dd = db.DB() + if recsize == "big": + isrec = tb.Description(Big) + elif recsize == "medium": + isrec = Medium() + else: + isrec = tb.Description(Small) + # dd.set_re_len(struct.calcsize(isrec._v_fmt)) # fixed length records + dd.open(filename, db.DB_RECNO, db.DB_CREATE | db.DB_TRUNCATE) + + rowswritten = 0 + # Get the record object associated with the new table + if recsize == "big": + isrec = Big() + arr = np.array(np.arange(32), type=np.float64) + arr2 = np.array(np.arange(32), type=np.float64) + elif recsize == "medium": + isrec = Medium() + arr = np.array(np.arange(2), type=np.float64) + else: + 
isrec = Small() + # print d + # Fill the table + if recsize == "big" or recsize == "medium": + d = {"name": " ", + "float1": 1.0, + "float2": 2.0, + "ADCcount": 12, + "grid_i": 1, + "grid_j": 1, + "pressure": 1.9, + "energy": 1.8, + } + for i in range(totalrows): + #d['name'] = 'Particle: %6d' % (i) + #d['TDCcount'] = i % 256 + d['ADCcount'] = (i * 256) % (1 << 16) + if recsize == "big": + #d.float1 = np.array([i]*32, np.float64) + #d.float2 = np.array([i**2]*32, np.float64) + arr[0] = 1.1 + d['float1'] = arr + arr2[0] = 2.2 + d['float2'] = arr2 + pass + else: + d['float1'] = float(i) + d['float2'] = float(i) + d['grid_i'] = i + d['grid_j'] = 10 - i + d['pressure'] = float(i * i) + d['energy'] = d['pressure'] + dd.append(cPickle.dumps(d)) +# dd.append(struct.pack(isrec._v_fmt, +# d['name'], d['float1'], d['float2'], +# d['ADCcount'], +# d['grid_i'], d['grid_j'], +# d['pressure'], d['energy'])) + else: + d = {"var1": " ", "var2": 1, "var3": 12.1e10} + for i in range(totalrows): + d['var1'] = str(i) + d['var2'] = i + d['var3'] = 12.1e10 + dd.append(cPickle.dumps(d)) + #dd.append( + # struct.pack(isrec._v_fmt, d['var1'], d['var2'], d['var3'])) + + rowswritten += totalrows + + # Close the file + dd.close() + return (rowswritten, struct.calcsize(isrec._v_fmt)) + + +def readFile(filename, recsize, verbose): + # Open the HDF5 file in read-only mode + #fileh = shelve.open(filename, "r") + dd = db.DB() + if recsize == "big": + isrec = Big() + elif recsize == "medium": + isrec = Medium() + else: + isrec = Small() + # dd.set_re_len(struct.calcsize(isrec._v_fmt)) # fixed length records + # dd.set_re_pad('-') # sets the pad character... + # dd.set_re_pad(45) # ...test both int and char + dd.open(filename, db.DB_RECNO) + if recsize == "big" or recsize == "medium": + print(isrec._v_fmt) + c = dd.cursor() + rec = c.first() + e = [] + while rec: + record = cPickle.loads(rec[1]) + #record = struct.unpack(isrec._v_fmt, rec[1]) + # if verbose: + # print record + if record['grid_i'] < 20: + e.append(record['grid_j']) + # if record[4] < 20: + # e.append(record[5]) + rec = next(c) + else: + print(isrec._v_fmt) + #e = [ t[1] for t in fileh[table] if t[1] < 20 ] + c = dd.cursor() + rec = c.first() + e = [] + while rec: + record = cPickle.loads(rec[1]) + #record = struct.unpack(isrec._v_fmt, rec[1]) + # if verbose: + # print record + if record['var2'] < 20: + e.append(record['var1']) + # if record[1] < 20: + # e.append(record[2]) + rec = next(c) + + print("resulting selection list ==>", e) + print("last record read ==>", record) + print("Total selected records ==> ", len(e)) + + # Close the file (eventually destroy the extended type) + dd.close() + + +# Add code to test here +if __name__ == "__main__": + import getopt + from time import perf_counter as clock + + usage = """usage: %s [-v] [-s recsize] [-i iterations] file + -v verbose + -s use [big] record, [medium] or [small] + -i sets the number of rows in each table\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 's:vi:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + recsize = "medium" + iterations = 100 + verbose = 0 + + # Get the options + for option in opts: + if option[0] == '-s': + recsize = option[1] + if recsize not in ["big", "medium", "small"]: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-i': + iterations = int(option[1]) + elif option[0] == '-v': + verbose = 1 + + # Catch the hdf5 
file passed as the last argument + file = pargs[0] + + t1 = clock() + psyco.bind(createFile) + (rowsw, rowsz) = createFile(file, iterations, recsize, verbose) + t2 = clock() + tapprows = t2 - t1 + + t1 = clock() + psyco.bind(readFile) + readFile(file, recsize, verbose) + t2 = clock() + treadrows = t2 - t1 + + print(f"Rows written: {rowsw}, Row size: {rowsz}") + print(f"Time appending rows: {tapprows:.3f}") + if tapprows > 0.001: + print(f"Write rows/sec: {iterations / tapprows:.0f}") + print(f"Write KB/s: {rowsw * rowsz / (tapprows * 1024):.0f}") + print(f"Time reading rows: {treadrows:.3f}") + if treadrows > 0.001: + print(f"Read rows/sec: {iterations / treadrows:.0f}") + print(f"Read KB/s: {rowsw * rowsz / (treadrows * 1024):.0f}") diff --git a/bench/cacheout.py b/bench/cacheout.py new file mode 100644 index 0000000..82e1f5c --- /dev/null +++ b/bench/cacheout.py @@ -0,0 +1,13 @@ +# Program to clean out the filesystem cache +import numpy as np + +a = np.arange(1000 * 100 * 125, dtype='f8') # 100 MB of RAM +b = a * 3 # Another 100 MB +# delete the reference to the booked memory +del a +del b + +# Do a loop to fully recharge the python interpreter +j = 2 +for i in range(1000 * 1000): + j += i * 2 diff --git a/bench/chunkshape-bench.py b/bench/chunkshape-bench.py new file mode 100644 index 0000000..6d0b7a3 --- /dev/null +++ b/bench/chunkshape-bench.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# Benchmark the effect of chunkshapes in reading large datasets. +# You need at least PyTables 2.1 to run this! +# F. Alted + +from time import perf_counter as clock +import numpy as np +import tables as tb + +dim1, dim2 = 360, 6_109_666 +rows_to_read = range(0, 360, 36) + +print("=" * 32) +# Create the EArray +f = tb.open_file("/tmp/test.h5", "w") +a = f.create_earray(f.root, "a", tb.Float64Atom(), shape=(dim1, 0), + expectedrows=dim2) +print("Chunkshape for original array:", a.chunkshape) + +# Fill the EArray +t1 = clock() +zeros = np.zeros((dim1, 1), dtype="float64") +for i in range(dim2): + a.append(zeros) +tcre = clock() - t1 +thcre = dim1 * dim2 * 8 / (tcre * 1024 * 1024) +print(f"Time to append {a.nrows} rows: {tcre:.3f} sec ({thcre:.1f} MB/s)") + +# Read some row vectors from the original array +t1 = clock() +for i in rows_to_read: + r1 = a[i, :] +tr1 = clock() - t1 +thr1 = dim2 * len(rows_to_read) * 8 / (tr1 * 1024 * 1024) +print(f"Time to read ten rows in original array: {tr1:.3f} sec ({thr1:.1f} MB/s)") + +print("=" * 32) +# Copy the array to another with a row-wise chunkshape +t1 = clock() +#newchunkshape = (1, a.chunkshape[0]*a.chunkshape[1]) +newchunkshape = (1, a.chunkshape[0] * a.chunkshape[1] * 10) # ten times larger +b = a.copy(f.root, "b", chunkshape=newchunkshape) +tcpy = clock() - t1 +thcpy = dim1 * dim2 * 8 / (tcpy * 1024 * 1024) +print("Chunkshape for row-wise chunkshape array:", b.chunkshape) +print(f"Time to copy the original array: {tcpy:.3f} sec ({thcpy:.1f} MB/s)") + +# Read the same ten rows from the new copied array +t1 = clock() +for i in rows_to_read: + r2 = b[i, :] +tr2 = clock() - t1 +thr2 = dim2 * len(rows_to_read) * 8 / (tr2 * 1024 * 1024) +print(f"Time to read with a row-wise chunkshape: {tr2:.3f} sec ({thr2:.1f} MB/s)") +print("=" * 32) +print(f"Speed-up with a row-wise chunkshape: {tr1 / tr2:.1f}") + +f.close() diff --git a/bench/chunkshape-testing.py b/bench/chunkshape-testing.py new file mode 100644 index 0000000..84e7587 --- /dev/null +++ b/bench/chunkshape-testing.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python + +"""Simple benchmark for testing chunkshapes and 
nrowsinbuf.""" + +import numpy as np +import tables as tb +from time import perf_counter as clock + +L = 20 +N = 2000 +M = 30 +complevel = 1 + +recarray = np.empty(shape=2, dtype='(2,2,2)i4,(2,3,3)f8,i4,i8') + +f = tb.open_file("chunkshape.h5", mode="w") + +# t = f.create_table(f.root, 'table', recarray, "mdim recarray") + +# a0 = f.create_array(f.root, 'field0', recarray['f0'], "mdim int32 array") +# a1 = f.create_array(f.root, 'field1', recarray['f1'], "mdim float64 array") + +# c0 = f.create_carray(f.root, 'cfield0', +# tables.Int32Atom(), (2,2,2), +# "mdim int32 carray") +# c1 = f.create_carray(f.root, 'cfield1', +# tables.Float64Atom(), (2,3,3), +# "mdim float64 carray") + +f1 = tb.open_file("chunkshape1.h5", mode="w") +c1 = f.create_carray(f1.root, 'cfield1', + tb.Int32Atom(), (L, N, M), + "scalar int32 carray", tb.Filters(complevel=0)) + +t1 = clock() +c1[:] = np.empty(shape=(L, 1, 1), dtype="int32") +print("carray1 populate time:", clock() - t1) +f1.close() + + +f2 = tb.open_file("chunkshape2.h5", mode="w") +c2 = f.create_carray(f2.root, 'cfield2', + tb.Int32Atom(), (L, M, N), + "scalar int32 carray", tb.Filters(complevel)) + +t1 = clock() +c2[:] = np.empty(shape=(L, 1, 1), dtype="int32") +print("carray2 populate time:", clock() - t1) +f2.close() + +f0 = tb.open_file("chunkshape0.h5", mode="w") +e0 = f.create_earray(f0.root, 'efield0', + tb.Int32Atom(), (0, L, M), + "scalar int32 carray", tb.Filters(complevel), + expectedrows=N) + +t1 = clock() +e0.append(np.empty(shape=(N, L, M), dtype="int32")) +print("earray0 populate time:", clock() - t1) +f0.close() + +f1 = tb.open_file("chunkshape1.h5", mode="w") +e1 = f.create_earray(f1.root, 'efield1', + tb.Int32Atom(), (L, 0, M), + "scalar int32 carray", tb.Filters(complevel), + expectedrows=N) + +t1 = clock() +e1.append(np.empty(shape=(L, N, M), dtype="int32")) +print("earray1 populate time:", clock() - t1) +f1.close() + + +f2 = tb.open_file("chunkshape2.h5", mode="w") +e2 = f.create_earray(f2.root, 'efield2', + tb.Int32Atom(), (L, M, 0), + "scalar int32 carray", tb.Filters(complevel), + expectedrows=N) + +t1 = clock() +e2.append(np.empty(shape=(L, M, N), dtype="int32")) +print("earray2 populate time:", clock() - t1) +f2.close() + +# t1=time() +# c2[:] = numpy.empty(shape=(M, N), dtype="int32") +# print "carray populate time:", time()-t1 + +# f3 = f.create_carray(f.root, 'cfield3', +# tables.Float64Atom(), (3,), +# "scalar float64 carray", chunkshape=(32,)) + +# e2 = f.create_earray(f.root, 'efield2', +# tables.Int32Atom(), (0, M), +# "scalar int32 carray", expectedrows=N) +# t1=time() +# e2.append(numpy.empty(shape=(N, M), dtype="int32")) +# print "earray populate time:", time()-t1 + +# t1=time() +# c2._f_copy(newname='cfield2bis') +# print "carray copy time:", time()-t1 +# t1=time() +# e2._f_copy(newname='efield2bis') +# print "earray copy time:", time()-t1 + +f.close() diff --git a/bench/collations.py b/bench/collations.py new file mode 100644 index 0000000..c631c3b --- /dev/null +++ b/bench/collations.py @@ -0,0 +1,123 @@ +import numpy as np +import tables as tb +from time import perf_counter as clock + +N = 1000 * 1000 +NCOLL = 200 # 200 collections maximum + +# In order to have reproducible results +np.random.seed(19) + + +class Energies(tb.IsDescription): + collection = tb.UInt8Col() + energy = tb.Float64Col() + + +def fill_bucket(lbucket): + #c = np.random.normal(NCOLL/2, NCOLL/10, lbucket) + c = np.random.normal(NCOLL / 2, NCOLL / 100, lbucket) + e = np.arange(lbucket, dtype='f8') + return c, e + +# Fill the table +t1 = clock() 
+f = tb.open_file("data.nobackup/collations.h5", "w") +table = f.create_table("/", "Energies", Energies, expectedrows=N) +# Fill the table with values +lbucket = 1000 # Fill in buckets of 1000 rows, for speed +for i in range(0, N, lbucket): + bucket = fill_bucket(lbucket) + table.append(bucket) +# Fill the remaining rows +bucket = fill_bucket(N % lbucket) +table.append(bucket) +f.close() +print(f"Time to create the table with {N} entries: {t1:.3f}") + +# Now, read the table and group it by collection +f = tb.open_file("data.nobackup/collations.h5", "a") +table = f.root.Energies + +######################################################### +# First solution: load the table completely in memory +######################################################### +t1 = clock() +t = table[:] # convert to structured array +coll1 = [] +collections = np.unique(t['collection']) +for c in collections: + cond = t['collection'] == c + energy_this_collection = t['energy'][cond] + sener = energy_this_collection.sum() + coll1.append(sener) + print(c, ' : ', sener) +del collections, energy_this_collection +print(f"Time for first solution: {clock() - t1:.3f}s") + +######################################################### +# Second solution: load all the collections in memory +######################################################### +t1 = clock() +collections = {} +for row in table: + c = row['collection'] + e = row['energy'] + if c in collections: + collections[c].append(e) + else: + collections[c] = [e] +# Convert the lists in numpy arrays +coll2 = [] +for c in sorted(collections): + energy_this_collection = np.array(collections[c]) + sener = energy_this_collection.sum() + coll2.append(sener) + print(c, ' : ', sener) +del collections, energy_this_collection +print(f"Time for second solution: {clock() - t1:.3f}s") + +t1 = clock() +table.cols.collection.create_csindex() +# table.cols.collection.reindex() +print(f"Time for indexing: {clock() - t1:.3f}s") + +######################################################### +# Third solution: load each collection separately +######################################################### +t1 = clock() +coll3 = [] +for c in np.unique(table.col('collection')): + energy_this_collection = table.read_where( + 'collection == c', field='energy') + sener = energy_this_collection.sum() + coll3.append(sener) + print(c, ' : ', sener) +del energy_this_collection +print(f"Time for third solution: {clock() - t1:.3f}s") + + +t1 = clock() +table2 = table.copy('/', 'EnergySortedByCollation', overwrite=True, + sortby="collection", propindexes=True) +print(f"Time for sorting: {clock() - t1:.3f}s") + +##################################################################### +# Fourth solution: load each collection separately. Sorted table. 
+##################################################################### +t1 = clock() +coll4 = [] +for c in np.unique(table2.col('collection')): + energy_this_collection = table2.read_where( + 'collection == c', field='energy') + sener = energy_this_collection.sum() + coll4.append(sener) + print(c, ' : ', sener) + del energy_this_collection +print(f"Time for fourth solution: {clock() - t1:.3f}s") + + +# Finally, check that all solutions do match +assert coll1 == coll2 == coll3 == coll4 + +f.close() diff --git a/bench/copy-bench.py b/bench/copy-bench.py new file mode 100644 index 0000000..bf2522d --- /dev/null +++ b/bench/copy-bench.py @@ -0,0 +1,32 @@ +import sys +from time import perf_counter as clock + +import tables as tb + +if len(sys.argv) != 3: + print("usage: %s source_file dest_file", sys.argv[0]) +filesrc = sys.argv[1] +filedest = sys.argv[2] +filehsrc = tb.open_file(filesrc) +filehdest = tb.open_file(filedest, 'w') +ntables = 0 +tsize = 0 +t1 = clock() +for group in filehsrc.walk_groups(): + if isinstance(group._v_parent, tb.File): + groupdest = filehdest.root + else: + pathname = group._v_parent._v_pathname + groupdest = filehdest.create_group(pathname, group._v_name, + title=group._v_title) + for table in filehsrc.list_nodes(group, classname='Table'): + print("copying table -->", table) + table.copy(groupdest, table.name) + ntables += 1 + tsize += table.nrows * table.rowsize +tsizeMB = tsize / (1024 * 1024) +ttime = clock() - t1 +print(f"Copied {ntables} tables for a total of {tsizeMB:.1f} MB" + f" in {ttime:.3f} seconds ({tsizeMB / ttime:.1f} MB/s)") +filehsrc.close() +filehdest.close() diff --git a/bench/create-large-number-objects.py b/bench/create-large-number-objects.py new file mode 100644 index 0000000..4695904 --- /dev/null +++ b/bench/create-large-number-objects.py @@ -0,0 +1,42 @@ +"This creates an HDF5 file with a potentially large number of objects" + +import sys +import numpy as np +import tables as tb + +filename = sys.argv[1] + +# Open a new empty HDF5 file +fileh = tb.open_file(filename, mode="w") + +# nlevels -- Number of levels in hierarchy +# ngroups -- Number of groups on each level +# ndatasets -- Number of arrays on each group +# LR: Low ratio groups/datasets +#nlevels, ngroups, ndatasets = (3, 1, 1000) +# MR: Medium ratio groups/datasets +nlevels, ngroups, ndatasets = (3, 10, 100) +#nlevels, ngroups, ndatasets = (3, 5, 10) +# HR: High ratio groups/datasets +#nlevels, ngroups, ndatasets = (30, 10, 10) + +# Create an Array to save on disk +a = np.array([-1, 2, 4], np.int16) + +group = fileh.root +group2 = fileh.root +for k in range(nlevels): + for j in range(ngroups): + for i in range(ndatasets): + # Save the array on the HDF5 file + fileh.create_array(group2, 'array' + str(i), + a, "Signed short array") + # Create a new group + group2 = fileh.create_group(group, 'group' + str(j)) + # Create a new group + group3 = fileh.create_group(group, 'ngroup' + str(k)) + # Iterate over this new group (group3) + group = group3 + group2 = group3 + +fileh.close() diff --git a/bench/deep-tree-h5py.py b/bench/deep-tree-h5py.py new file mode 100644 index 0000000..0ad9d1f --- /dev/null +++ b/bench/deep-tree-h5py.py @@ -0,0 +1,113 @@ +from pathlib import Path +from time import perf_counter as clock +import random +import numpy as np +import h5py + +random.seed(2) + + +def show_stats(explain, tref): + "Show the used memory (only works for Linux 2.6.x)." 
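+    # The fields parsed below come from /proc/self/status and are reported by
+    # the kernel in kB: VmSize (total virtual memory), VmRSS (resident set),
+    # and VmData/VmStk/VmExe/VmLib (data, stack, text and shared libraries).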
+ for line in Path('/proc/self/status').read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + print("Memory usage: ******* %s *******" % explain) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") + tnow = clock() + print(f"WallClock time: {tnow - tref:.3f}") + return tnow + + +def populate(f, nlevels): + g = f + arr = np.zeros((10,), "f4") + for i in range(nlevels): + g["DS1"] = arr + g["DS2"] = arr + g.create_group('group2_') + g = g.create_group('group') + + +def getnode(f, nlevels, niter, range_): + for i in range(niter): + nlevel = random.randrange( + (nlevels - range_) / 2, (nlevels + range_) / 2) + groupname = "" + for i in range(nlevel): + groupname += "/group" + groupname += "/DS1" + f[groupname] + + +if __name__ == '__main__': + nlevels = 1024 + niter = 1000 + range_ = 256 + profile = True + doprofile = True + verbose = False + + if doprofile: + import pstats + import cProfile as prof + + if profile: + tref = clock() + if profile: + show_stats("Abans de crear...", tref) + f = h5py.File("/tmp/deep-tree.h5", 'w') + if doprofile: + prof.run('populate(f, nlevels)', 'populate.prof') + stats = pstats.Stats('populate.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + populate(f, nlevels) + f.close() + if profile: + show_stats("Despres de crear", tref) + +# if profile: tref = time() +# if profile: show_stats("Abans d'obrir...", tref) +# f = h5py.File("/tmp/deep-tree.h5", 'r') +# if profile: show_stats("Abans d'accedir...", tref) +# if doprofile: +# prof.run('getnode(f, nlevels, niter, range_)', 'deep-tree.prof') +# stats = pstats.Stats('deep-tree.prof') +# stats.strip_dirs() +# stats.sort_stats('time', 'calls') +# if verbose: +# stats.print_stats() +# else: +# stats.print_stats(20) +# else: +# getnode(f, nlevels, niter, range_) +# if profile: show_stats("Despres d'accedir", tref) +# f.close() +# if profile: show_stats("Despres de tancar", tref) + +# f = h5py.File("/tmp/deep-tree.h5", 'r') +# g = f +# for i in range(nlevels): +# dset = g["DS1"] +# dset = g["DS2"] +# group2 = g['group2_'] +# g = g['group'] +# f.close() diff --git a/bench/deep-tree.py b/bench/deep-tree.py new file mode 100644 index 0000000..b2a43ec --- /dev/null +++ b/bench/deep-tree.py @@ -0,0 +1,122 @@ +# Small benchmark for compare creation times with parameter +# PYTABLES_SYS_ATTRS active or not. + +from pathlib import Path +from time import perf_counter as clock +import random +import tables as tb + +random.seed(2) + + +def show_stats(explain, tref): + "Show the used memory (only works for Linux 2.6.x)." 
+ for line in Path('/proc/self/status').read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + print("Memory usage: ******* %s *******" % explain) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") + tnow = clock() + print(f"WallClock time: {tnow - tref:.3f}") + return tnow + + +def populate(f, nlevels): + g = f.root + #arr = numpy.zeros((10,), "f4") + #descr = {'f0': tables.Int32Col(), 'f1': tables.Float32Col()} + for i in range(nlevels): + #dset = f.create_array(g, "DS1", arr) + #dset = f.create_array(g, "DS2", arr) + f.create_carray(g, "DS1", tb.IntAtom(), (10,)) + f.create_carray(g, "DS2", tb.IntAtom(), (10,)) + #dset = f.create_table(g, "DS1", descr) + #dset = f.create_table(g, "DS2", descr) + f.create_group(g, 'group2_') + g = f.create_group(g, 'group') + + +def getnode(f, nlevels, niter, range_): + for i in range(niter): + nlevel = random.randrange( + (nlevels - range_) / 2, (nlevels + range_) / 2) + groupname = "" + for i in range(nlevel): + groupname += "/group" + groupname += "/DS1" + f.get_node(groupname) + + +if __name__ == '__main__': + nlevels = 1024 + niter = 256 + range_ = 128 + nodeCacheSlots = 64 + pytables_sys_attrs = True + profile = True + doprofile = True + verbose = False + + if doprofile: + import pstats + import cProfile as prof + + if profile: + tref = clock() + if profile: + show_stats("Abans de crear...", tref) + f = tb.open_file("/tmp/PTdeep-tree.h5", 'w', + node_cache_slots=nodeCacheSlots, + pytables_sys_attrs=pytables_sys_attrs) + if doprofile: + prof.run('populate(f, nlevels)', 'populate.prof') + stats = pstats.Stats('populate.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + populate(f, nlevels) + f.close() + if profile: + show_stats("Despres de crear", tref) + + if profile: + tref = clock() + if profile: + show_stats("Abans d'obrir...", tref) + f = tb.open_file("/tmp/PTdeep-tree.h5", 'r', + node_cache_slots=nodeCacheSlots, + pytables_sys_attrs=pytables_sys_attrs) + if profile: + show_stats("Abans d'accedir...", tref) + if doprofile: + prof.run('getnode(f, nlevels, niter, range_)', 'getnode.prof') + stats = pstats.Stats('getnode.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + getnode(f, nlevels, niter, range_) + if profile: + show_stats("Despres d'accedir", tref) + f.close() + if profile: + show_stats("Despres de tancar", tref) diff --git a/bench/evaluate.py b/bench/evaluate.py new file mode 100644 index 0000000..4d30973 --- /dev/null +++ b/bench/evaluate.py @@ -0,0 +1,175 @@ +import sys +from time import perf_counter as clock + +import numexpr as ne +import numpy as np + +import tables as tb + + +shape = (1000, 160_000) +#shape = (10,1600) +filters = tb.Filters(complevel=1, complib="blosc", shuffle=0) +ofilters = tb.Filters(complevel=1, complib="blosc", shuffle=0) +#filters = tb.Filters(complevel=1, complib="lzo", shuffle=0) +#ofilters = tb.Filters(complevel=1, complib="lzo", shuffle=0) + +# TODO: 
Makes it sense to add a 's'tring typecode here? +typecode_to_dtype = {'b': 'bool', 'i': 'int32', 'l': 'int64', 'f': 'float32', + 'd': 'float64', 'c': 'complex128'} + + +def _compute(result, function, arguments, + start=None, stop=None, step=None): + """Compute the `function` over the `arguments` and put the outcome in + `result`""" + arg0 = arguments[0] + if hasattr(arg0, 'maindim'): + maindim = arg0.maindim + (start, stop, step) = arg0._process_range_read(start, stop, step) + nrowsinbuf = arg0.nrowsinbuf + print("nrowsinbuf-->", nrowsinbuf) + else: + maindim = 0 + (start, stop, step) = (0, len(arg0), 1) + nrowsinbuf = len(arg0) + shape = list(arg0.shape) + shape[maindim] = len(range(start, stop, step)) + + # The slices parameter for arg0.__getitem__ + slices = [slice(0, dim, 1) for dim in arg0.shape] + + # This is a hack to prevent doing unnecessary conversions + # when copying buffers + if hasattr(arg0, 'maindim'): + for arg in arguments: + arg._v_convert = False + + # Start the computation itself + for start2 in range(start, stop, step * nrowsinbuf): + # Save the records on disk + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Set the proper slice in the main dimension + slices[maindim] = slice(start2, stop2, step) + start3 = (start2 - start) / step + stop3 = start3 + nrowsinbuf + if stop3 > shape[maindim]: + stop3 = shape[maindim] + # Compute the slice to be filled in destination + sl = [] + for i in range(maindim): + sl.append(slice(None, None, None)) + sl.append(slice(start3, stop3, None)) + # Get the values for computing the buffer + values = [arg.__getitem__(tuple(slices)) for arg in arguments] + result[tuple(sl)] = function(*values) + + # Activate the conversion again (default) + if hasattr(arg0, 'maindim'): + for arg in arguments: + arg._v_convert = True + + return result + + +def evaluate(ex, out=None, local_dict=None, global_dict=None, **kwargs): + """Evaluate expression and return an array.""" + + # First, get the signature for the arrays in expression + context = ne.necompiler.getContext(kwargs) + names, _ = ne.necompiler.getExprNames(ex, context) + + # Get the arguments based on the names. 
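+    # (Name resolution below follows numexpr.evaluate(): when no dictionaries
+    # are passed in, each name in the expression is looked up first in the
+    # caller's locals and then in its globals.)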
+ call_frame = sys._getframe(1) + if local_dict is None: + local_dict = call_frame.f_locals + if global_dict is None: + global_dict = call_frame.f_globals + arguments = [] + types = [] + for name in names: + try: + a = local_dict[name] + except KeyError: + a = global_dict[name] + arguments.append(a) + if hasattr(a, 'atom'): + types.append(a.atom) + else: + types.append(a) + + # Create a signature + signature = [ + (name, ne.necompiler.getType(type_)) + for (name, type_) in zip(names, types) + ] + print("signature-->", signature) + + # Compile the expression + compiled_ex = ne.necompiler.NumExpr(ex, signature, [], **kwargs) + print("fullsig-->", compiled_ex.fullsig) + + _compute(out, compiled_ex, arguments) + + return + + +if __name__ == "__main__": + iarrays = 0 + oarrays = 0 + doprofile = 1 + dokprofile = 0 + + f = tb.open_file("/scratch2/faltet/evaluate.h5", "w") + + # Create some arrays + if iarrays: + a = np.ones(shape, dtype='float32') + b = np.ones(shape, dtype='float32') * 2 + c = np.ones(shape, dtype='float32') * 3 + else: + a = f.create_carray(f.root, 'a', tb.Float32Atom(dflt=1), + shape=shape, filters=filters) + a[:] = 1 + b = f.create_carray(f.root, 'b', tb.Float32Atom(dflt=2), + shape=shape, filters=filters) + b[:] = 2 + c = f.create_carray(f.root, 'c', tb.Float32Atom(dflt=3), + shape=shape, filters=filters) + c[:] = 3 + if oarrays: + out = np.empty(shape, dtype='float32') + else: + out = f.create_carray(f.root, 'out', tb.Float32Atom(), + shape=shape, filters=ofilters) + + t0 = clock() + if iarrays and oarrays: + #out = ne.evaluate("a*b+c") + out = a * b + c + elif doprofile: + import cProfile as prof + import pstats + prof.run('evaluate("a*b+c", out)', 'evaluate.prof') + stats = pstats.Stats('evaluate.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats(20) + elif dokprofile: + from cProfile import Profile + import lsprofcalltree + prof = Profile() + prof.run('evaluate("a*b+c", out)') + kcg = lsprofcalltree.KCacheGrind(prof) + with Path('evaluate.kcg').open('w') as ofile: + kcg.output(ofile) + else: + evaluate("a*b+c", out) + print(f"Time for evaluate--> {clock() - t0:.3f}") + + # print "out-->", `out` + # print `out[:]` + + f.close() diff --git a/bench/expression.py b/bench/expression.py new file mode 100644 index 0000000..0a4fc56 --- /dev/null +++ b/bench/expression.py @@ -0,0 +1,175 @@ +from pathlib import Path +from time import perf_counter as clock + +import numpy as np +import tables as tb + +OUT_DIR = Path("/scratch2/faltet/") # the directory for data output + +shape = (1000, 1000 * 1000) # shape for input arrays +expr = "a*b+1" # Expression to be computed + +nrows, ncols = shape + + +def tables(docompute, dowrite, complib, verbose): + + # Filenames + ifilename = OUT_DIR / "expression-inputs.h5" + ofilename = OUT_DIR / "expression-outputs.h5" + + # Filters + shuffle = True + if complib == 'blosc': + filters = tb.Filters(complevel=1, complib='blosc', shuffle=shuffle) + elif complib == 'lzo': + filters = tb.Filters(complevel=1, complib='lzo', shuffle=shuffle) + elif complib == 'zlib': + filters = tb.Filters(complevel=1, complib='zlib', shuffle=shuffle) + else: + filters = tb.Filters(complevel=0, shuffle=False) + if verbose: + print("Will use filters:", filters) + + if dowrite: + f = tb.open_file(ifilename, 'w') + + # Build input arrays + t0 = clock() + root = f.root + a = f.create_carray(root, 'a', tb.Float32Atom(), + shape, filters=filters) + b = f.create_carray(root, 'b', tb.Float32Atom(), + shape, filters=filters) + if verbose: + 
print("chunkshape:", a.chunkshape) + print("chunksize:", np.prod(a.chunkshape) * a.dtype.itemsize) + #row = np.linspace(0, 1, ncols) + row = np.arange(0, ncols, dtype='float32') + for i in range(nrows): + a[i] = row * (i + 1) + b[i] = row * (i + 1) * 2 + f.close() + print(f"[tables.Expr] Time for creating inputs: {clock() - t0:.3f}") + + if docompute: + f = tb.open_file(ifilename, 'r') + fr = tb.open_file(ofilename, 'w') + a = f.root.a + b = f.root.b + r1 = f.create_carray(fr.root, 'r1', tb.Float32Atom(), shape, + filters=filters) + # The expression + e = tb.Expr(expr) + e.set_output(r1) + t0 = clock() + e.eval() + if verbose: + print("First ten values:", r1[0, :10]) + f.close() + fr.close() + print(f"[tables.Expr] Time for computing & save: {clock() - t0:.3f}") + + +def memmap(docompute, dowrite, verbose): + + afilename = OUT_DIR / "memmap-a.bin" + bfilename = OUT_DIR / "memmap-b.bin" + rfilename = OUT_DIR / "memmap-output.bin" + if dowrite: + t0 = clock() + a = np.memmap(afilename, dtype='float32', mode='w+', shape=shape) + b = np.memmap(bfilename, dtype='float32', mode='w+', shape=shape) + + # Fill arrays a and b + #row = np.linspace(0, 1, ncols) + row = np.arange(0, ncols, dtype='float32') + for i in range(nrows): + a[i] = row * (i + 1) + b[i] = row * (i + 1) * 2 + del a, b # flush data + print(f"[numpy.memmap] Time for creating inputs: {clock() - t0:.3f}") + + if docompute: + t0 = clock() + # Reopen inputs in read-only mode + a = np.memmap(afilename, dtype='float32', mode='r', shape=shape) + b = np.memmap(bfilename, dtype='float32', mode='r', shape=shape) + # Create the array output + r = np.memmap(rfilename, dtype='float32', mode='w+', shape=shape) + # Do the computation row by row + for i in range(nrows): + r[i] = eval(expr, {'a': a[i], 'b': b[i]}) + if verbose: + print("First ten values:", r[0, :10]) + del a, b + del r # flush output data + print(f"[numpy.memmap] Time for compute & save: {clock() - t0:.3f}") + + +def do_bench(what, documpute, dowrite, complib, verbose): + if what == "tables": + tables(docompute, dowrite, complib, verbose) + if what == "memmap": + memmap(docompute, dowrite, verbose) + + +if __name__ == "__main__": + import sys + import getopt + + usage = """usage: %s [-T] [-M] [-c] [-w] [-v] [-z complib] + -T use tables.Expr + -M use numpy.memmap + -c do the computation only + -w write inputs only + -v verbose mode + -z select compression library ('zlib' or 'lzo'). Default is None. 
+""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'TMcwvz:') + except: + sys.stderr.write(usage) + sys.exit(1) + + # default options + usepytables = False + usememmap = False + docompute = False + dowrite = False + verbose = False + complib = None + + # Get the options + for option in opts: + if option[0] == '-T': + usepytables = True + elif option[0] == '-M': + usememmap = True + elif option[0] == '-c': + docompute = True + elif option[0] == '-w': + dowrite = True + elif option[0] == '-v': + verbose = True + elif option[0] == '-z': + complib = option[1] + if complib not in ('blosc', 'lzo', 'zlib'): + print("complib must be 'lzo' or 'zlib' " + "and you passed: '%s'" % complib) + sys.exit(1) + + # If not a backend selected, abort + if not usepytables and not usememmap: + print("Please select a backend:") + print("PyTables.Expr: -T") + print("NumPy.memmap: -M") + sys.exit(1) + + # Select backend and do the benchmark + if usepytables: + what = "tables" + if usememmap: + what = "memmap" + do_bench(what, docompute, dowrite, complib, verbose) diff --git a/bench/get-figures-ranges.py b/bench/get-figures-ranges.py new file mode 100644 index 0000000..7dc8d0e --- /dev/null +++ b/bench/get-figures-ranges.py @@ -0,0 +1,229 @@ +from pathlib import Path +from pylab import * + +linewidth = 2 +#markers= ['+', ',', 'o', '.', 's', 'v', 'x', '>', '<', '^'] +#markers= [ 'x', '+', 'o', 's', 'v', '^', '>', '<', ] +markers = ['s', 'o', 'v', '^', '+', 'x', '>', '<', ] +markersize = 8 + + +def get_values(filename): + sizes = [] + values = [] + isize = None + for line in Path(filename).read_text().splitlines(): + if line.startswith('range'): + tmp = line.split(':')[1] + tmp = tmp.strip() + tmp = tmp[1:-1] + lower, upper = int(tmp.split(',')[0]), int(tmp.split(',')[1]) + isize = upper - lower + # print "isize-->", isize + if isize is None or isize == 0: + continue + if insert and line.startswith('Insert time'): + tmp = line.split(':')[1] + #itime = float(tmp[:tmp.index(',')]) + itime = float(tmp) + sizes.append(isize) + values.append(itime) + elif line.startswith('Index time'): + tmp = line.split(':')[1] + #xtime = float(tmp[:tmp.index(',')]) + xtime = float(tmp) + txtime += xtime + if create_index and create_index in line: + sizes.append(isize) + values.append(xtime) + elif create_total and txtime > xtime: + sizes.append(isize) + values.append(txtime) + elif table_size and line.startswith('Table size'): + tsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(tsize) + elif indexes_size and line.startswith('Indexes size'): + xsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(xsize) + elif total_size and line.startswith('Full size'): + fsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(fsize) + elif ((query or query_cold or query_warm) and + line.startswith('[NOREP]')): + tmp = line.split(':')[1] + try: + qtime = float(tmp[:tmp.index('+-')]) + except ValueError: + qtime = float(tmp) + if colname in line: + if query and '1st' in line: + sizes.append(isize) + values.append(qtime) + elif query_cold and 'cold' in line: + sizes.append(isize) + values.append(qtime) + elif query_warm and 'warm' in line: + sizes.append(isize) + values.append(qtime) + return sizes, values + + +def show_plot(plots, yaxis, legends, gtitle): + xlabel('Number of hits') + ylabel(yaxis) + title(gtitle) + #ylim(0, 100) + grid(True) + +# legends = [f[f.find('-'):f.index('.out')] for f in filenames] +# legends = [l.replace('-', ' ') for l in legends] + #legend([p[0] for p 
in plots], legends, loc = "upper left") + legend([p[0] for p in plots], legends, loc="best") + + #subplots_adjust(bottom=0.2, top=None, wspace=0.2, hspace=0.2) + if outfile: + savefig(outfile) + else: + show() + +if __name__ == '__main__': + + import sys + import getopt + + usage = """usage: %s [-o file] [-t title] [--insert] [--create-index] [--create-total] [--table-size] [--indexes-size] [--total-size] [--query=colname] [--query-cold=colname] [--query-warm=colname] files + -o filename for output (only .png and .jpg extensions supported) + -t title of the plot + --insert -- Insert time for table + --create-index=colname -- Index time for column + --create-total -- Total time for creation of table + indexes + --table-size -- Size of table + --indexes-size -- Size of all indexes + --total-size -- Total size of table + indexes + --query=colname -- Time for querying the specified column + --query-cold=colname -- Time for querying the specified column (cold cache) + --query-warm=colname -- Time for querying the specified column (warm cache) + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'o:t:', + ['insert', + 'create-index=', + 'create-total', + 'table-size', + 'indexes-size', + 'total-size', + 'query=', + 'query-cold=', + 'query-warm=', + ]) + except: + sys.stderr.write(usage) + sys.exit(0) + + progname = sys.argv[0] + args = sys.argv[1:] + + # if we pass too few parameters, abort + if len(pargs) < 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + outfile = None + insert = 0 + create_index = None + create_total = 0 + table_size = 0 + indexes_size = 0 + total_size = 0 + query = 0 + query_cold = 0 + query_warm = 0 + colname = None + yaxis = "No axis name" + tit = None + gtitle = "Please set a title!" + + # Get the options + for option in opts: + if option[0] == '-o': + outfile = option[1] + elif option[0] == '-t': + tit = option[1] + elif option[0] == '--insert': + insert = 1 + yaxis = "Time (s)" + gtitle = "Insert time for table" + elif option[0] == '--create-index': + create_index = option[1] + yaxis = "Time (s)" + gtitle = "Create index time for column " + create_index + elif option[0] == '--create-total': + create_total = 1 + yaxis = "Time (s)" + gtitle = "Create time for table + indexes" + elif option[0] == '--table-size': + table_size = 1 + yaxis = "Size (MB)" + gtitle = "Table size" + elif option[0] == '--indexes-size': + indexes_size = 1 + yaxis = "Size (MB)" + gtitle = "Indexes size" + elif option[0] == '--total-size': + total_size = 1 + yaxis = "Size (MB)" + gtitle = "Total size (table + indexes)" + elif option[0] == '--query': + query = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (first query)" + elif option[0] == '--query-cold': + query_cold = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (cold cache)" + elif option[0] == '--query-warm': + query_warm = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (warm cache)" + + filenames = pargs + + if tit: + gtitle = tit + + plots = [] + legends = [] + for filename in filenames: + plegend = filename[filename.find('-'):filename.index('.out')] + plegend = plegend.replace('-', ' ') + xval, yval = get_values(filename) + print(f"Values for {filename} --> {xval}, {yval}") + if "PyTables" in filename or "pytables" in filename: + plot = loglog(xval, yval, linewidth=2) + #plot = semilogx(xval, yval, linewidth=2) + plots.append(plot) + setp(plot, 
marker=markers[0], markersize=markersize, + linewidth=linewidth) + else: + plots.append(loglog(xval, yval, linewidth=3, color='m')) + #plots.append(semilogx(xval, yval, linewidth=3, color='m')) + #plots.append(semilogx(xval, yval, linewidth=5)) + legends.append(plegend) + if 0: # Per a introduir dades simulades si es vol... + xval = [1000, 10_000, 100_000, 1_000_000, 10_000_000, + 100_000_000, 1_000_000_000] +# yval = [0.003, 0.005, 0.02, 0.06, 1.2, +# 40, 210] + yval = [0.0009, 0.0011, 0.0022, 0.005, 0.02, + 0.2, 5.6] + plots.append(loglog(xval, yval, linewidth=5)) + legends.append("PyTables Std") + show_plot(plots, yaxis, legends, gtitle) diff --git a/bench/get-figures.py b/bench/get-figures.py new file mode 100644 index 0000000..ca1ecf1 --- /dev/null +++ b/bench/get-figures.py @@ -0,0 +1,288 @@ +from pylab import * + +linewidth = 2 +#markers= ['+', ',', 'o', '.', 's', 'v', 'x', '>', '<', '^'] +#markers= [ 'x', '+', 'o', 's', 'v', '^', '>', '<', ] +markers = ['s', 'o', 'v', '^', '+', 'x', '>', '<', ] +markersize = 8 + + +def get_values(filename): + sizes = [] + values = [] + for line in Path(filename).read_text().splitlines(): + if line.startswith('Processing database:'): + txtime = 0 + line = line.split(':')[1] + # Check if entry is compressed and if has to be processed + line = line[:line.rfind('.')] + params = line.split('-') + for param in params: + if param[-1] in ('k', 'm', 'g'): + size = param + isize = int(size[:-1]) * 1000 + if size[-1] == "m": + isize *= 1000 + elif size[-1] == "g": + isize *= 1000 * 1000 + elif insert and line.startswith('Insert time'): + tmp = line.split(':')[1] + itime = float(tmp) + sizes.append(isize) + values.append(itime) + elif (overlaps or entropy) and line.startswith('overlaps'): + tmp = line.split(':')[1] + e1, e2 = tmp.split() + if isize in sizes: + sizes.pop() + values.pop() + sizes.append(isize) + if overlaps: + values.append(int(e1) + 1) + else: + values.append(float(e2) + 1) + elif (create_total or create_index) and line.startswith('Index time'): + tmp = line.split(':')[1] + xtime = float(tmp) + txtime += xtime + if create_index and create_index in line: + sizes.append(isize) + values.append(xtime) + elif create_total and txtime > xtime: + sizes.append(isize) + values.append(txtime) + elif table_size and line.startswith('Table size'): + tsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(tsize) + elif indexes_size and line.startswith('Indexes size'): + xsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(xsize) + elif total_size and line.startswith('Full size'): + fsize = float(line.split(':')[1]) + sizes.append(isize) + values.append(fsize) + elif query and line.startswith('Query time'): + tmp = line.split(':')[1] + qtime = float(tmp) + if colname in line: + sizes.append(isize) + values.append(qtime) + elif ((query or query_cold or query_warm) and + line.startswith('[NOREP]')): + tmp = line.split(':')[1] + try: + qtime = float(tmp[:tmp.index('+-')]) + except ValueError: + qtime = float(tmp) + if colname in line: + if query and '1st' in line: + sizes.append(isize) + values.append(qtime) + elif query_cold and 'cold' in line: + sizes.append(isize) + values.append(qtime) + elif query_warm and 'warm' in line: + sizes.append(isize) + values.append(qtime) + elif query_repeated and line.startswith('[REP]'): + if colname in line and 'warm' in line: + tmp = line.split(':')[1] + qtime = float(tmp[:tmp.index('+-')]) + sizes.append(isize) + values.append(qtime) + return sizes, values + + +def show_plot(plots, yaxis, 
legends, gtitle): + xlabel('Number of rows') + ylabel(yaxis) + title(gtitle) + #xlim(10**3, 10**9) + xlim(10 ** 3, 10 ** 10) + # ylim(1.0e-5) + #ylim(-1e4, 1e5) + #ylim(-1e3, 1e4) + #ylim(-1e2, 1e3) + grid(True) + +# legends = [f[f.find('-'):f.index('.out')] for f in filenames] +# legends = [l.replace('-', ' ') for l in legends] + legend([p[0] for p in plots], legends, loc="upper left") + #legend([p[0] for p in plots], legends, loc = "center left") + + #subplots_adjust(bottom=0.2, top=None, wspace=0.2, hspace=0.2) + if outfile: + savefig(outfile) + else: + show() + +if __name__ == '__main__': + + import sys + import getopt + + usage = """usage: %s [-o file] [-t title] [--insert] [--create-index] [--create-total] [--overlaps] [--entropy] [--table-size] [--indexes-size] [--total-size] [--query=colname] [--query-cold=colname] [--query-warm=colname] [--query-repeated=colname] files + -o filename for output (only .png and .jpg extensions supported) + -t title of the plot + --insert -- Insert time for table + --create-index=colname -- Index time for column + --create-total -- Total time for creation of table + indexes + --overlaps -- The overlapping for the created index + --entropy -- The entropy for the created index + --table-size -- Size of table + --indexes-size -- Size of all indexes + --total-size -- Total size of table + indexes + --query=colname -- Time for querying the specified column + --query-cold=colname -- Time for querying the specified column (cold cache) + --query-warm=colname -- Time for querying the specified column (warm cache) + --query-repeated=colname -- Time for querying the specified column (rep query) + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'o:t:', + ['insert', + 'create-index=', + 'create-total', + 'overlaps', + 'entropy', + 'table-size', + 'indexes-size', + 'total-size', + 'query=', + 'query-cold=', + 'query-warm=', + 'query-repeated=', + ]) + except: + sys.stderr.write(usage) + sys.exit(0) + + progname = sys.argv[0] + args = sys.argv[1:] + + # if we pass too few parameters, abort + if len(pargs) < 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + outfile = None + insert = 0 + create_index = None + create_total = 0 + overlaps = 0 + entropy = 0 + table_size = 0 + indexes_size = 0 + total_size = 0 + query = 0 + query_cold = 0 + query_warm = 0 + query_repeated = 0 + colname = None + yaxis = "No axis name" + tit = None + gtitle = "Please set a title!" 
+ + # Get the options + for option in opts: + if option[0] == '-o': + outfile = option[1] + elif option[0] == '-t': + tit = option[1] + elif option[0] == '--insert': + insert = 1 + yaxis = "Time (s)" + gtitle = "Insert time for table" + elif option[0] == '--create-index': + create_index = option[1] + yaxis = "Time (s)" + gtitle = "Create index time for " + create_index + " column" + elif option[0] == '--create-total': + create_total = 1 + yaxis = "Time (s)" + gtitle = "Create time for table + indexes" + elif option[0] == '--overlaps': + overlaps = 1 + yaxis = "Overlapping index + 1" + gtitle = "Overlapping for col4 column" + elif option[0] == '--entropy': + entropy = 1 + yaxis = "Entropy + 1" + gtitle = "Entropy for col4 column" + elif option[0] == '--table-size': + table_size = 1 + yaxis = "Size (MB)" + gtitle = "Table size" + elif option[0] == '--indexes-size': + indexes_size = 1 + yaxis = "Size (MB)" + #gtitle = "Indexes size" + gtitle = "Index size for col4 column" + elif option[0] == '--total-size': + total_size = 1 + yaxis = "Size (MB)" + gtitle = "Total size (table + indexes)" + elif option[0] == '--query': + query = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (first query)" + elif option[0] == '--query-cold': + query_cold = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (cold cache)" + elif option[0] == '--query-warm': + query_warm = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (warm cache)" + elif option[0] == '--query-repeated': + query_repeated = 1 + colname = option[1] + yaxis = "Time (s)" + gtitle = "Query time for " + colname + " column (repeated query)" + + gtitle = gtitle.replace('col2', 'Int32') + gtitle = gtitle.replace('col4', 'Float64') + + filenames = pargs + + if tit: + gtitle = tit + + plots = [] + legends = [] + for i, filename in enumerate(filenames): + plegend = filename[:filename.index('.out')] + plegend = plegend.replace('-', ' ') + #plegend = plegend.replace('zlib1', '') + if filename.find('PyTables') != -1: + xval, yval = get_values(filename) + print(f"Values for {filename} --> {xval}, {yval}") + if xval != []: + plot = loglog(xval, yval) + #plot = semilogx(xval, yval) + setp(plot, marker=markers[i], markersize=markersize, + linewidth=linewidth) + plots.append(plot) + legends.append(plegend) + else: + xval, yval = get_values(filename) + print(f"Values for {filename} --> {xval}, {yval}") + plots.append(loglog(xval, yval, linewidth=3, color='m')) + #plots.append(semilogx(xval, yval, linewidth=linewidth, color='m')) + legends.append(plegend) + if 0: # Per a introduir dades simulades si es vol... 
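+        # (The Catalan note above means "to introduce simulated data if
+        # desired": this disabled branch plots a hand-made reference curve
+        # labelled "PyTables Std".)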
+ xval = [1000, 10_000, 100_000, 1_000_000, 10_000_000, + 100_000_000, 1_000_000_000] +# yval = [0.003, 0.005, 0.02, 0.06, 1.2, +# 40, 210] + yval = [0.0009, 0.0011, 0.0022, 0.005, 0.02, + 0.2, 5.6] + plots.append(loglog(xval, yval, linewidth=linewidth)) + legends.append("PyTables Std") + show_plot(plots, yaxis, legends, gtitle) diff --git a/bench/indexed_search.py b/bench/indexed_search.py new file mode 100644 index 0000000..0109cc0 --- /dev/null +++ b/bench/indexed_search.py @@ -0,0 +1,457 @@ +import random +import subprocess +from pathlib import Path +from time import perf_counter as clock + +import numpy as np + +# Constants + +STEP = 1000 * 100 # the size of the buffer to fill the table, in rows +SCALE = 0.1 # standard deviation of the noise compared with actual + # values +NI_NTIMES = 1 # The number of queries for doing a mean (non-idx cols) +# COLDCACHE = 10 # The number of reads where the cache is considered 'cold' +# WARMCACHE = 50 # The number of reads until the cache is considered 'warmed' +# READ_TIMES = WARMCACHE+50 # The number of complete calls to DB.query_db() +# COLDCACHE = 50 # The number of reads where the cache is considered 'cold' +# WARMCACHE = 50 # The number of reads until the cache is considered 'warmed' +# READ_TIMES = WARMCACHE+50 # The number of complete calls to DB.query_db() +MROW = 1000 * 1000 + +# Test values +COLDCACHE = 5 # The number of reads where the cache is considered 'cold' +WARMCACHE = 5 # The number of reads until the cache is considered 'warmed' +READ_TIMES = 10 # The number of complete calls to DB.query_db() + +# global variables +rdm_cod = ['lin', 'rnd'] +prec = 6 # precision for printing floats purposes + + +def get_nrows(nrows_str): + powers = {'k': 3, 'm': 6, 'g': 9} + try: + return int(float(nrows_str[:-1]) * 10 ** powers[nrows_str[-1]]) + except KeyError: + raise ValueError( + "value of nrows must end with either 'k', 'm' or 'g' suffixes.") + + +class DB: + + def __init__(self, nrows, rng, userandom): + global step, scale + self.step = STEP + self.scale = SCALE + self.rng = rng + self.userandom = userandom + self.filename = '-'.join([rdm_cod[userandom], nrows]) + self.nrows = get_nrows(nrows) + + def get_db_size(self): + sout = subprocess.Popen("sync;du -s %s" % self.filename, shell=True, + stdout=subprocess.PIPE).stdout + line = [l for l in sout][0] + return int(line.split()[0]) + + def print_mtime(self, t1, explain): + mtime = clock() - t1 + print(f"{explain}: {mtime:.6f}") + print(f"Krows/s: {self.nrows / 1000 / mtime:.6f}") + + def print_qtime(self, colname, ltimes): + qtime1 = ltimes[0] # First measured time + qtime2 = ltimes[-1] # Last measured time + print(f"Query time for {colname}: {qtime1:.6f}") + print(f"Mrows/s: {self.nrows / MROW / qtime1:.6f}") + print(f"Query time for {colname} (cached): {qtime2:.6f}") + print(f"Mrows/s (cached): {self.nrows / MROW / qtime2:.6f}") + + def norm_times(self, ltimes): + "Get the mean and stddev of ltimes, avoiding the extreme values." 
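+        # Worked example with made-up numbers: for ltimes = [1.0, 1.1, 1.2, 9.0]
+        # the raw mean is ~3.08 and the std ~3.42, so the 9.0 outlier (above
+        # mean + std ~ 6.5) is dropped and the reported mean becomes 1.1 +- 0.08.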
+ lmean = ltimes.mean() + lstd = ltimes.std() + ntimes = ltimes[ltimes < lmean + lstd] + nmean = ntimes.mean() + nstd = ntimes.std() + return nmean, nstd + + def print_qtime_idx(self, colname, ltimes, repeated, verbose): + if repeated: + r = "[REP] " + else: + r = "[NOREP] " + ltimes = np.array(ltimes) + ntimes = len(ltimes) + qtime1 = ltimes[0] # First measured time + ctimes = ltimes[1:COLDCACHE] + cmean, cstd = self.norm_times(ctimes) + wtimes = ltimes[WARMCACHE:] + wmean, wstd = self.norm_times(wtimes) + if verbose: + print("Times for cold cache:\n", ctimes) + # print "Times for warm cache:\n", wtimes + hist1, hist2 = np.histogram(wtimes) + print(f"Histogram for warm cache: {hist1}\n{hist2}") + print(f"{r}1st query time for {colname}: {qtime1:.{prec}f}") + print(f"{r}Query time for {colname} (cold cache): " + f"{cmean:.{prec}f} +- {cstd:.{prec}f}") + print(f"{r}Query time for {colname} (warm cache): " + f"{wmean:.{prec}f} +- {wstd:.{prec}f}") + + def print_db_sizes(self, init, filled, indexed): + table_size = (filled - init) / 1024 + indexes_size = (indexed - filled) / 1024 + print(f"Table size (MB): {table_size:.3f}") + print(f"Indexes size (MB): {indexes_size:.3f}") + print(f"Full size (MB): {table_size + indexes_size:.3f}") + + def fill_arrays(self, start, stop): + arr_f8 = np.arange(start, stop, dtype='float64') + arr_i4 = np.arange(start, stop, dtype='int32') + if self.userandom: + arr_f8 += np.random.normal(0, stop * self.scale, size=stop - start) + arr_i4 = np.array(arr_f8, dtype='int32') + return arr_i4, arr_f8 + + def create_db(self, dtype, kind, optlevel, verbose): + self.con = self.open_db(remove=1) + self.create_table(self.con) + init_size = self.get_db_size() + t1 = clock() + self.fill_table(self.con) + table_size = self.get_db_size() + self.print_mtime(t1, 'Insert time') + self.index_db(dtype, kind, optlevel, verbose) + indexes_size = self.get_db_size() + self.print_db_sizes(init_size, table_size, indexes_size) + self.close_db(self.con) + + def index_db(self, dtype, kind, optlevel, verbose): + if dtype == "int": + idx_cols = ['col2'] + elif dtype == "float": + idx_cols = ['col4'] + else: + idx_cols = ['col2', 'col4'] + for colname in idx_cols: + t1 = clock() + self.index_col(self.con, colname, kind, optlevel, verbose) + self.print_mtime(t1, 'Index time (%s)' % colname) + + def query_db(self, niter, dtype, onlyidxquery, onlynonidxquery, + avoidfscache, verbose, inkernel): + self.con = self.open_db() + if dtype == "int": + reg_cols = ['col1'] + idx_cols = ['col2'] + elif dtype == "float": + reg_cols = ['col3'] + idx_cols = ['col4'] + else: + reg_cols = ['col1', 'col3'] + idx_cols = ['col2', 'col4'] + if avoidfscache: + rseed = int(np.random.randint(self.nrows)) + else: + rseed = 19 + # Query for non-indexed columns + np.random.seed(rseed) + base = np.random.randint(self.nrows) + if not onlyidxquery: + for colname in reg_cols: + ltimes = [] + random.seed(rseed) + for i in range(NI_NTIMES): + t1 = clock() + results = self.do_query(self.con, colname, base, inkernel) + ltimes.append(clock() - t1) + if verbose: + print("Results len:", results) + self.print_qtime(colname, ltimes) + # Always reopen the file after *every* query loop. + # Necessary to make the benchmark to run correctly. 
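For reference, the cold/warm bookkeeping done by print_qtime_idx() above amounts to slicing the list of per-query timings; a condensed sketch using the same test constants defined at the top of the script:

import numpy as np

COLDCACHE = 5   # first reads: file-system and index caches still cold
WARMCACHE = 5   # from this read on the caches are considered warmed up

def split_query_times(ltimes):
    # Returns (first query, cold-cache slice, warm-cache slice).
    ltimes = np.asarray(ltimes, dtype='float64')
    return ltimes[0], ltimes[1:COLDCACHE], ltimes[WARMCACHE:]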
+ self.close_db(self.con) + self.con = self.open_db() + # Query for indexed columns + if not onlynonidxquery: + for colname in idx_cols: + ltimes = [] + np.random.seed(rseed) + rndbase = np.random.randint(self.nrows, size=niter) + # First, non-repeated queries + for i in range(niter): + base = rndbase[i] + t1 = clock() + results = self.do_query(self.con, colname, base, inkernel) + #results, tprof = self.do_query( + # self.con, colname, base, inkernel) + ltimes.append(clock() - t1) + if verbose: + print("Results len:", results) + self.print_qtime_idx(colname, ltimes, False, verbose) + # Always reopen the file after *every* query loop. + # Necessary to make the benchmark to run correctly. + self.close_db(self.con) + self.con = self.open_db() + ltimes = [] +# Second, repeated queries +# for i in range(niter): +# t1=time() +# results = self.do_query( +# self.con, colname, base, inkernel) +# results, tprof = self.do_query(self.con, colname, base, inkernel) +# ltimes.append(time()-t1) +# if verbose: +# print "Results len:", results +# self.print_qtime_idx(colname, ltimes, True, verbose) + # Print internal PyTables index tprof statistics + #tprof = numpy.array(tprof) + #tmean, tstd = self.norm_times(tprof) + # print "tprof-->", round(tmean, prec), "+-", round(tstd, prec) + # print "tprof hist-->", \ + # numpy.histogram(tprof) + # print "tprof raw-->", tprof + # Always reopen the file after *every* query loop. + # Necessary to make the benchmark to run correctly. + self.close_db(self.con) + self.con = self.open_db() + # Finally, close the file. + self.close_db(self.con) + + def close_db(self, con): + con.close() + + +if __name__ == "__main__": + import sys + import getopt + + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-T] [-P] [-v] [-f] [-k] [-p] [-m] [-c] [-q] [-i] [-I] [-S] [-x] [-z complevel] [-l complib] [-R range] [-N niter] [-n nrows] [-d datadir] [-O level] [-t kind] [-s] col -Q [suplim] + -T use Pytables + -P use Postgres + -v verbose + -f do a profile of the run (only query functionality) + -k do a profile for kcachegrind use (out file is 'indexed_search.kcg') + -p use "psyco" if available + -m use random values to fill the table + -q do a query (both indexed and non-indexed versions) + -i do a query (just indexed one) + -I do a query (just in-kernel one) + -S do a query (just standard one) + -x choose a different seed for random numbers (i.e. avoid FS cache) + -c create the database + -z compress with zlib (no compression by default) + -l use complib for compression (zlib used by default) + -R select a range in a field in the form "start,stop" (def "0,10") + -N number of iterations for reading + -n sets the number of rows (in krows) in each table + -d directory to save data (default: data.nobackup) + -O set the optimization level for PyTables indexes + -t select the index type: "medium" (default) or "full", "light", "ultralight" + -s select a type column for operations ('int' or 'float'. 
def all) + -Q do a repeteated query up to 10**value + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt( + sys.argv[1:], 'TPvfkpmcqiISxz:l:R:N:n:d:O:t:s:Q:') + except: + sys.stderr.write(usage) + sys.exit(1) + + # default options + usepytables = 0 + usepostgres = 0 + verbose = 0 + doprofile = 0 + dokprofile = 0 + usepsyco = 0 + userandom = 0 + docreate = 0 + optlevel = 0 + kind = "medium" + docompress = 0 + complib = "zlib" + doquery = False + onlyidxquery = False + onlynonidxquery = False + inkernel = True + avoidfscache = 0 + #rng = [-10, 10] + rng = [-1000, -1000] + repeatquery = 0 + repeatvalue = 0 + krows = '1k' + niter = READ_TIMES + dtype = "all" + datadir = "data.nobackup" + + # Get the options + for option in opts: + if option[0] == '-T': + usepytables = 1 + elif option[0] == '-P': + usepostgres = 1 + elif option[0] == '-v': + verbose = 1 + elif option[0] == '-f': + doprofile = 1 + elif option[0] == '-k': + dokprofile = 1 + elif option[0] == '-p': + usepsyco = 1 + elif option[0] == '-m': + userandom = 1 + elif option[0] == '-c': + docreate = 1 + elif option[0] == '-q': + doquery = True + elif option[0] == '-i': + doquery = True + onlyidxquery = True + elif option[0] == '-I': + doquery = True + onlynonidxquery = True + elif option[0] == '-S': + doquery = True + onlynonidxquery = True + inkernel = False + elif option[0] == '-x': + avoidfscache = 1 + elif option[0] == '-z': + docompress = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-N': + niter = int(option[1]) + elif option[0] == '-n': + krows = option[1] + elif option[0] == '-d': + datadir = option[1] + elif option[0] == '-O': + optlevel = int(option[1]) + elif option[0] == '-t': + if option[1] in ('full', 'medium', 'light', 'ultralight'): + kind = option[1] + else: + print("kind should be either 'full', 'medium', 'light' or " + "'ultralight'") + sys.exit(1) + elif option[0] == '-s': + if option[1] in ('int', 'float'): + dtype = option[1] + else: + print("column should be either 'int' or 'float'") + sys.exit(1) + elif option[0] == '-Q': + repeatquery = 1 + repeatvalue = int(option[1]) + + # If not database backend selected, abort + if not usepytables and not usepostgres: + print("Please select a backend:") + print("PyTables: -T") + print("Postgres: -P") + sys.exit(1) + + # Create the class for the database + if usepytables: + from pytables_backend import PyTables_DB + db = PyTables_DB(krows, rng, userandom, datadir, + docompress, complib, kind, optlevel) + elif usepostgres: + from postgres_backend import Postgres_DB + db = Postgres_DB(krows, rng, userandom) + + if not avoidfscache: + # in order to always generate the same random sequence + np.random.seed(20) + + if verbose: + if userandom: + print("using random values") + if onlyidxquery: + print("doing indexed queries only") + + if psyco_imported and usepsyco: + psyco.bind(db.create_db) + psyco.bind(db.query_db) + + if docreate: + if verbose: + print("writing %s rows" % krows) + db.create_db(dtype, kind, optlevel, verbose) + + if doquery: + print("Calling query_db() %s times" % niter) + if doprofile: + import pstats + import cProfile as prof + prof.run( + 'db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, ' + 'avoidfscache, verbose, inkernel)', + 'indexed_search.prof') + stats = pstats.Stats('indexed_search.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + elif 
dokprofile: + from cProfile import Profile + import lsprofcalltree + prof = Profile() + prof.run( + 'db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, ' + 'avoidfscache, verbose, inkernel)') + kcg = lsprofcalltree.KCacheGrind(prof) + with Path('indexed_search.kcg').open('w') as ofile: + kcg.output(ofile) + elif doprofile: + import hotshot + import hotshot.stats + prof = hotshot.Profile("indexed_search.prof") + benchtime, stones = prof.run( + 'db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, ' + 'avoidfscache, verbose, inkernel)') + prof.close() + stats = hotshot.stats.load("indexed_search.prof") + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats(20) + else: + db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, + avoidfscache, verbose, inkernel) + + if repeatquery: + # Start by a range which is almost None + db.rng = [1, 1] + if verbose: + print("range:", db.rng) + db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, + avoidfscache, verbose, inkernel) + for i in range(repeatvalue): + for j in (1, 2, 5): + rng = j * 10 ** i + db.rng = [-rng / 2, rng / 2] + if verbose: + print("range:", db.rng) +# if usepostgres: +# os.system( +# "echo 1 > /proc/sys/vm/drop_caches;" +# " /etc/init.d/postgresql restart") +# else: +# os.system("echo 1 > /proc/sys/vm/drop_caches") + db.query_db(niter, dtype, onlyidxquery, onlynonidxquery, + avoidfscache, verbose, inkernel) diff --git a/bench/keysort.py b/bench/keysort.py new file mode 100644 index 0000000..30db7ef --- /dev/null +++ b/bench/keysort.py @@ -0,0 +1,33 @@ +from time import perf_counter as clock + +import numpy as np +import tables as tb + +N = 1000 * 1000 +rnd = np.random.randint(N, size=N) + +for dtype1 in ('S6', 'b1', + 'i1', 'i2', 'i4', 'i8', + 'u1', 'u2', 'u4', 'u8', 'f4', 'f8'): + for dtype2 in ('u4', 'i8'): + print("dtype array1, array2-->", dtype1, dtype2) + a = np.array(rnd, dtype1) + b = np.arange(N, dtype=dtype2) + c = a.copy() + + t1 = clock() + d = c.argsort() + # c.sort() + # e=c + e = c[d] + f = b[d] + tref = clock() - t1 + print("normal sort time-->", tref) + + t1 = clock() + tb.indexesextension.keysort(a, b) + tks = clock() - t1 + print("keysort time-->", tks, " {:.2f}x".format(tref / tks)) + assert np.alltrue(a == e) + #assert numpy.alltrue(b == d) + assert np.alltrue(f == d) diff --git a/bench/lookup_bench.py b/bench/lookup_bench.py new file mode 100644 index 0000000..bb94c92 --- /dev/null +++ b/bench/lookup_bench.py @@ -0,0 +1,238 @@ +"""Benchmark to help choosing the best chunksize so as to optimize the access +time in random lookups.""" + +import subprocess +from pathlib import Path +from time import perf_counter as clock + +import numpy as np +import tables as tb + +# Constants +NOISE = 1e-15 # standard deviation of the noise compared with actual values + +rdm_cod = ['lin', 'rnd'] + + +def get_nrows(nrows_str): + powers = {'k': 3, 'm': 6, 'g': 9} + try: + return int(float(nrows_str[:-1]) * 10 ** powers[nrows_str[-1]]) + except KeyError: + raise ValueError( + "value of nrows must end with either 'k', 'm' or 'g' suffixes.") + + +class DB: + + def __init__(self, nrows, dtype, chunksize, userandom, datadir, + docompress=0, complib='zlib'): + self.dtype = dtype + self.docompress = docompress + self.complib = complib + self.filename = '-'.join([rdm_cod[userandom], + "n" + nrows, "s" + chunksize, dtype]) + # Complete the filename + self.filename = "lookup-" + self.filename + if docompress: + self.filename += '-' + complib + str(docompress) + self.filename = datadir + '/' + self.filename 
+ '.h5' + print("Processing database:", self.filename) + self.userandom = userandom + self.nrows = get_nrows(nrows) + self.chunksize = get_nrows(chunksize) + self.step = self.chunksize + self.scale = NOISE + + def get_db_size(self): + sout = subprocess.Popen("sync;du -s %s" % self.filename, shell=True, + stdout=subprocess.PIPE).stdout + line = [l for l in sout][0] + return int(line.split()[0]) + + def print_mtime(self, t1, explain): + mtime = clock() - t1 + print(f"{explain}: {mtime:.6f}") + print(f"Krows/s: {self.nrows / 1000 / mtime:.6f}") + + def print_db_sizes(self, init, filled): + array_size = (filled - init) / 1024 + print(f"Array size (MB): {array_size:.3f}") + + def open_db(self, remove=0): + if remove and Path(self.filename).is_file(): + Path(self.filename).unlink() + con = tb.open_file(self.filename, 'a') + return con + + def create_db(self, verbose): + self.con = self.open_db(remove=1) + self.create_array() + init_size = self.get_db_size() + t1 = clock() + self.fill_array() + array_size = self.get_db_size() + self.print_mtime(t1, 'Insert time') + self.print_db_sizes(init_size, array_size) + self.close_db() + + def create_array(self): + # The filters chosen + filters = tb.Filters(complevel=self.docompress, + complib=self.complib) + atom = tb.Atom.from_kind(self.dtype) + self.con.create_earray(self.con.root, 'earray', atom, (0,), + filters=filters, + expectedrows=self.nrows, + chunkshape=(self.chunksize,)) + + def fill_array(self): + "Fills the array" + earray = self.con.root.earray + j = 0 + arr = self.get_array(0, self.step) + for i in range(0, self.nrows, self.step): + stop = (j + 1) * self.step + if stop > self.nrows: + stop = self.nrows + ###arr = self.get_array(i, stop, dtype) + earray.append(arr) + j += 1 + earray.flush() + + def get_array(self, start, stop): + arr = np.arange(start, stop, dtype='float') + if self.userandom: + arr += np.random.normal(0, stop * self.scale, size=stop - start) + arr = arr.astype(self.dtype) + return arr + + def print_qtime(self, ltimes): + ltimes = np.array(ltimes) + print("Raw query times:\n", ltimes) + print("Histogram times:\n", np.histogram(ltimes[1:])) + ntimes = len(ltimes) + qtime1 = ltimes[0] # First measured time + if ntimes > 5: + # Wait until the 5th iteration (in order to + # ensure that the index is effectively cached) to take times + qtime2 = sum(ltimes[5:]) / (ntimes - 5) + else: + qtime2 = ltimes[-1] # Last measured time + print(f"1st query time: {qtime1:.3f}") + print(f"Mean (skipping the first 5 meas.): {qtime2:.3f}") + + def query_db(self, niter, avoidfscache, verbose): + self.con = self.open_db() + earray = self.con.root.earray + if avoidfscache: + rseed = int(np.random.randint(self.nrows)) + else: + rseed = 19 + np.random.seed(rseed) + np.random.randint(self.nrows) + ltimes = [] + for i in range(niter): + t1 = clock() + self.do_query(earray, np.random.randint(self.nrows)) + ltimes.append(clock() - t1) + self.print_qtime(ltimes) + self.close_db() + + def do_query(self, earray, idx): + return earray[idx] + + def close_db(self): + self.con.close() + + +if __name__ == "__main__": + import sys + import getopt + + usage = """usage: %s [-v] [-m] [-c] [-q] [-x] [-z complevel] [-l complib] [-N niter] [-n nrows] [-d datadir] [-t] type [-s] chunksize + -v verbose + -m use random values to fill the array + -q do a (random) lookup + -x choose a different seed for random numbers (i.e. 
avoid FS cache) + -c create the file + -z compress with zlib (no compression by default) + -l use complib for compression (zlib used by default) + -N number of iterations for reading + -n sets the number of rows in the array + -d directory to save data (default: data.nobackup) + -t select the type for array ('int' or 'float'. def 'float') + -s select the chunksize for array + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vmcqxz:l:N:n:d:t:s:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + userandom = 0 + docreate = 0 + optlevel = 0 + docompress = 0 + complib = "zlib" + doquery = False + avoidfscache = 0 + krows = '1k' + chunksize = '32k' + niter = 50 + datadir = "data.nobackup" + dtype = "float" + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-m': + userandom = 1 + elif option[0] == '-c': + docreate = 1 + createindex = 1 + elif option[0] == '-q': + doquery = True + elif option[0] == '-x': + avoidfscache = 1 + elif option[0] == '-z': + docompress = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-N': + niter = int(option[1]) + elif option[0] == '-n': + krows = option[1] + elif option[0] == '-d': + datadir = option[1] + elif option[0] == '-t': + if option[1] in ('int', 'float'): + dtype = option[1] + else: + print("type should be either 'int' or 'float'") + sys.exit(0) + elif option[0] == '-s': + chunksize = option[1] + + if not avoidfscache: + # in order to always generate the same random sequence + np.random.seed(20) + + if verbose: + if userandom: + print("using random values") + + db = DB(krows, dtype, chunksize, userandom, datadir, docompress, complib) + + if docreate: + if verbose: + print("writing %s rows" % krows) + db.create_db(verbose) + + if doquery: + print("Calling query_db() %s times" % niter) + db.query_db(niter, avoidfscache, verbose) diff --git a/bench/open_close-bench.py b/bench/open_close-bench.py new file mode 100644 index 0000000..488d85b --- /dev/null +++ b/bench/open_close-bench.py @@ -0,0 +1,232 @@ +"""Testbed for open/close PyTables files. + +This uses the HotShot profiler. 
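A minimal sketch of the measurement pattern used below (Linux only, since it reads /proc/self/status; 'test.h5' is a placeholder file name):

from time import perf_counter as clock
import tables as tb

def vmrss_kb():
    # Resident set size in kB, taken from /proc/self/status.
    for line in open('/proc/self/status'):
        if line.startswith('VmRSS:'):
            return int(line.split()[1])

t0 = clock()
with tb.open_file('test.h5') as fileh:   # placeholder file name
    n_nodes = sum(1 for _ in fileh)      # full browse of every node
print(f"{n_nodes} nodes  wall: {clock() - t0:.3f} s  RSS: {vmrss_kb()} kB")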
+ +""" + +import os +import sys +import getopt +import pstats +import cProfile as prof +from pathlib import Path +from time import perf_counter as clock + +import tables as tb + +filename = None +niter = 1 + + +def show_stats(explain, tref): + "Show the used memory" + for line in Path('/proc/self/status').read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + print("WallClock time:", clock() - tref) + print("Memory usage: ******* %s *******" % explain) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") + + +def check_open_close(): + for i in range(niter): + print( + "------------------ open_close #%s -------------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + fileh.close() + show_stats("After closing file", tref) + + +def check_only_open(): + for i in range(niter): + print("------------------ only_open #%s -------------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + show_stats("Before closing file", tref) + fileh.close() + + +def check_full_browse(): + for i in range(niter): + print("------------------ full_browse #%s -----------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + for node in fileh: + pass + fileh.close() + show_stats("After full browse", tref) + + +def check_partial_browse(): + for i in range(niter): + print("------------------ partial_browse #%s --------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + for node in fileh.root.ngroup0.ngroup1: + pass + fileh.close() + show_stats("After closing file", tref) + + +def check_full_browse_attrs(): + for i in range(niter): + print("------------------ full_browse_attrs #%s -----------------" % i) + tref = clock() + fileh = tb.open_file(filename) + for node in fileh: + # Access to an attribute + klass = node._v_attrs.CLASS + fileh.close() + show_stats("After full browse", tref) + + +def check_partial_browse_attrs(): + for i in range(niter): + print("------------------ partial_browse_attrs #%s --------------" % i) + tref = clock() + fileh = tb.open_file(filename) + for node in fileh.root.ngroup0.ngroup1: + # Access to an attribute + klass = node._v_attrs.CLASS + fileh.close() + show_stats("After closing file", tref) + + +def check_open_group(): + for i in range(niter): + print("------------------ open_group #%s ------------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + group = fileh.root.ngroup0.ngroup1 + # Access to an attribute + klass = group._v_attrs.CLASS + fileh.close() + show_stats("After closing file", tref) + + +def check_open_leaf(): + for i in range(niter): + print("------------------ open_leaf #%s -----------------------" % i) + tref = clock() + fileh = tb.open_file(filename) + leaf = fileh.root.ngroup0.ngroup1.array9 + # Access to an attribute + klass = leaf._v_attrs.CLASS + fileh.close() + show_stats("After closing file", tref) + + +if __name__ == '__main__': + + usage = """usage: %s [-v] [-p] [-n niter] [-O] [-o] [-B] [-b] [-g] [-l] [-A] [-a] [-E] [-S] datafile + -v verbose (total dump of profiling) + -p do profiling + -n number of 
iterations for reading + -O Check open_close + -o Check only_open + -B Check full browse + -b Check partial browse + -A Check full browse and reading one attr each node + -a Check partial browse and reading one attr each node + -g Check open nested group + -l Check open nested leaf + -E Check everything + -S Check everything as subprocess + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpn:OoBbAaglESs') + except: + sys.stderr.write(usage) + sys.exit(0) + + progname = sys.argv[0] + args = sys.argv[1:] + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + silent = 0 # if silent, does not print the final statistics + profile = 0 + all_checks = 0 + all_system_checks = 0 + func = [] + + # Checking options + options = ['-O', '-o', '-B', '-b', '-A', '-a', '-g', '-l'] + + # Dict to map options to checking functions + option2func = { + '-O': 'check_open_close', + '-o': 'check_only_open', + '-B': 'check_full_browse', + '-b': 'check_partial_browse', + '-A': 'check_full_browse_attrs', + '-a': 'check_partial_browse_attrs', + '-g': 'check_open_group', + '-l': 'check_open_leaf', + } + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-p': + profile = 1 + elif option[0] in option2func: + func.append(option2func[option[0]]) + elif option[0] == '-E': + all_checks = 1 + for opt in options: + func.append(option2func[opt]) + elif option[0] == '-S': + all_system_checks = 1 + elif option[0] == '-s': + silent = 1 + elif option[0] == '-n': + niter = int(option[1]) + + filename = pargs[0] + + tref = clock() + if all_system_checks: + args.remove('-S') # We don't want -S in the options list again + for opt in options: + opts = r"{} \-s {} {}".format(progname, opt, " ".join(args)) + # print "opts-->", opts + os.system("python2.4 %s" % opts) + else: + if profile: + for ifunc in func: + prof.run(ifunc + '()', ifunc + '.prof') + stats = pstats.Stats(ifunc + '.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + for ifunc in func: + eval(ifunc + '()') + + if not silent: + print("------------------ End of run -------------------------") + show_stats("Final statistics (after closing everything)", tref) diff --git a/bench/opteron-stress-test.txt b/bench/opteron-stress-test.txt new file mode 100644 index 0000000..33dc599 --- /dev/null +++ b/bench/opteron-stress-test.txt @@ -0,0 +1,63 @@ +Stress test on a 64 bits AMD Opteron platform +============================================= +2004-02-04. F. 
Alted + +Platform description: + +4 processors AMD Opteron (64-bits) @ 1.6 GHz and 1 MB cache +8 GB RAM +HD IBM DeskStar 120GXP 80 GB ATA/100 2 MB cache @ 7200 rpm +SuSe Linux Enterprise Server (SLES) +Linux kernel 2.4.21-178-smp +ReiserFS filesystem + +Here's the command to do the stress test: + +time python /tmp/stress-test3.py -l zlib -c 6 -g400 -t 300 -i 20000 /tmp/test-big-zlib-6.h5 +ls -lh /tmp/test-big-zlib-6.h5 + +The output: + +Compression level: 6 +Compression library: zlib +Rows written: 2400000000 Row size: 512 +Time writing rows: 56173.557 s (real) 56154.84 s (cpu) 100% +Write rows/sec: 42724 +Write KB/s : 21362 +Rows read: 2400000000 Row size: 512 Buf size: 39936 +Time reading rows: 29339.936 s (real) 29087.88 s (cpu) 99% +Read rows/sec: 81799 +Read KB/s : 40899 + +real 1425m43.846s +user 1308m34.340s +sys 112m17.100s +-rw-r--r-- 1 falted users 2.7G 2004-02-04 02:25 /tmp/test-big-zlib-6 +.h5 + +The maximum amount of RAM taken by the test should be less than 300 MB (241 +MB when the test was running for 5750 minutes, which is the last time I've +check for it). + + +Another test with the same machine: + +time python /tmp/stress-test3.py -l zlib -c 6 -g400 -t 300 -i 100000 /tmp/test-big-zlib-6-2.h5 +ls -lh /tmp/test-big-zlib-6-2.h5 + +Compression level: 6 +Compression library: zlib +Rows written: 12000000000 Row size: 512 +Time writing rows: 262930.901 s (real) 262619.72 s (cpu) 100% +Write rows/sec: 45639 +Write KB/s : 22819 +Rows read: 12000000000 Row size: 512 Buf size: 49664 +Time reading rows: 143171.761 s (real) 141560.42 s (cpu) 99% +Read rows/sec: 83815 +Read KB/s : 41907 + +real 6768m34.076s +user 6183m38.690s +sys 552m51.150s +-rw-r--r-- 1 5350 users 11G 2004-02-09 00:57 /tmp/test-big-zlib-6 +-2.h5 diff --git a/bench/optimal-chunksize.py b/bench/optimal-chunksize.py new file mode 100644 index 0000000..fc071d4 --- /dev/null +++ b/bench/optimal-chunksize.py @@ -0,0 +1,125 @@ +"""Small benchmark on the effect of chunksizes and compression on HDF5 files. + +Francesc Alted +2007-11-25 + +""" + +import math +import subprocess +import tempfile +from pathlib import Path +from time import perf_counter as clock +import numpy as np +import tables as tb + +# Size of dataset +# N, M = 512, 2**16 # 256 MB +# N, M = 512, 2**18 # 1 GB +# N, M = 512, 2**19 # 2 GB +N, M = 2000, 1_000_000 # 15 GB +# N, M = 4000, 1000000 # 30 GB +datom = tb.Float64Atom() # elements are double precision + + +def quantize(data, least_significant_digit): + """Quantize data to improve compression. + + data is quantized using around(scale*data)/scale, where scale is + 2**bits, and bits is determined from the least_significant_digit. + For example, if least_significant_digit=1, bits will be 4. 
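A worked instance of that rule (illustrative only, not part of the benchmark): with least_significant_digit=1, precision = 0.1, bits = ceil(log2(1/0.1)) = 4 and scale = 2**4 = 16, so values are rounded to multiples of 1/16. That keeps roughly one decimal digit while zeroing low-order mantissa bits, which makes the chunks compress far better.

import numpy as np

scale = 2 ** 4                          # bits = 4 for one decimal digit
data = np.array([0.123456, 0.654321])
print(np.around(scale * data) / scale)  # -> [0.125 0.625]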
+ + """ + + precision = 10 ** -least_significant_digit + exp = math.log(precision, 10) + if exp < 0: + exp = math.floor(exp) + else: + exp = math.ceil(exp) + bits = math.ceil(math.log(10 ** -exp, 2)) + scale = 2 ** bits + return np.around(scale * data) / scale + + +def get_db_size(filename): + sout = subprocess.Popen("ls -sh %s" % filename, shell=True, + stdout=subprocess.PIPE).stdout + line = [l for l in sout][0] + return line.split()[0] + + +def bench(chunkshape, filters): + np.random.seed(1) # to have reproductible results + filename = tempfile.mktemp(suffix='.h5') + print("Doing test on the file system represented by:", filename) + + f = tb.open_file(filename, 'w') + e = f.create_earray(f.root, 'earray', datom, shape=(0, M), + filters = filters, + chunkshape = chunkshape) + # Fill the array + t1 = clock() + for i in range(N): + # e.append([numpy.random.rand(M)]) # use this for less compressibility + e.append([quantize(np.random.rand(M), 6)]) + # os.system("sync") + print(f"Creation time: {clock() - t1:.3f}", end=' ') + filesize = get_db_size(filename) + filesize_bytes = Path(filename).stat().st_size + print("\t\tFile size: %d -- (%s)" % (filesize_bytes, filesize)) + + # Read in sequential mode: + e = f.root.earray + t1 = clock() + # Flush everything to disk and flush caches + #os.system("sync; echo 1 > /proc/sys/vm/drop_caches") + for row in e: + t = row + print(f"Sequential read time: {clock() - t1:.3f}", end=' ') + + # f.close() + # return + + # Read in random mode: + i_index = np.random.randint(0, N, 128) + j_index = np.random.randint(0, M, 256) + # Flush everything to disk and flush caches + #os.system("sync; echo 1 > /proc/sys/vm/drop_caches") + + # Protection against too large chunksizes + # 4 MB + if 0 and filters.complevel and chunkshape[0] * chunkshape[1] * 8 > 2 ** 22: + f.close() + return + + t1 = clock() + for i in i_index: + for j in j_index: + t = e[i, j] + print(f"\tRandom read time: {clock() - t1:.3f}") + + f.close() + +# Benchmark with different chunksizes and filters +# for complevel in (0, 1, 3, 6, 9): +for complib in (None, 'zlib', 'lzo', 'blosc'): +# for complib in ('blosc',): + if complib: + filters = tb.Filters(complevel=5, complib=complib) + else: + filters = tb.Filters(complevel=0) + print("8<--" * 20, "\nFilters:", filters, "\n" + "-" * 80) + # for ecs in (11, 14, 17, 20, 21, 22): + for ecs in range(10, 24): + # for ecs in (19,): + chunksize = 2 ** ecs + chunk1 = 1 + chunk2 = chunksize / datom.itemsize + if chunk2 > M: + chunk1 = chunk2 / M + chunk2 = M + chunkshape = (chunk1, chunk2) + cs_str = str(chunksize / 1024) + " KB" + print("***** Chunksize:", cs_str, "/ Chunkshape:", chunkshape, "*****") + bench(chunkshape, filters) diff --git a/bench/plot-bar.py b/bench/plot-bar.py new file mode 100644 index 0000000..70e9005 --- /dev/null +++ b/bench/plot-bar.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# a stacked bar plot with errorbars + +from pathlib import Path +from pylab import * + +checks = ['open_close', 'only_open', + 'full_browse', 'partial_browse', + 'full_browse_attrs', 'partial_browse_attrs', + 'open_group', 'open_leaf', + 'total'] +width = 0.15 # the width of the bars: can also be len(x) sequence +colors = ['r', 'm', 'g', 'y', 'b'] +ind = arange(len(checks)) # the x locations for the groups + + +def get_values(filename): + values = [] + for line in Path(filename).read_text().splitlines(): + if show_memory: + if line.startswith('VmData:'): + values.append(float(line.split()[1]) / 1024) + else: + if line.startswith('WallClock time:'): + 
values.append(float(line.split(':')[1])) + return values + + +def plot_bar(values, n): + global ind + if not gtotal: + # Remove the grand totals + values.pop() + if n == 0: + checks.pop() + ind = arange(len(checks)) + p = bar(ind + width * n, values, width, color=colors[n]) + return p + + +def show_plot(bars, filenames, tit): + if show_memory: + ylabel('Memory (MB)') + else: + ylabel('Time (s)') + title(tit) + n = len(filenames) + xticks(ind + width * n / 2, checks, rotation=45, + horizontalalignment='right', fontsize=8) + if not gtotal: + #loc = 'center right' + loc = 'upper left' + else: + loc = 'center left' + + legends = [f[:f.index('_')] for f in filenames] + legends = [l.replace('-', ' ') for l in legends] + legend([p[0] for p in bars], legends, loc=loc) + + subplots_adjust(bottom=0.2, top=None, wspace=0.2, hspace=0.2) + if outfile: + savefig(outfile) + else: + show() + +if __name__ == '__main__': + + import sys + import getopt + + usage = """usage: %s [-g] [-m] [-o file] [-t title] files + -g grand total + -m show memory instead of time + -o filename for output (only .png and .jpg extensions supported) + -t title of the plot + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'gmo:t:') + except: + sys.stderr.write(usage) + sys.exit(0) + + progname = sys.argv[0] + args = sys.argv[1:] + + # if we pass too few parameters, abort + if len(pargs) < 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + tit = "Comparison of differents PyTables versions" + gtotal = 0 + show_memory = 0 + outfile = None + + # Get the options + for option in opts: + if option[0] == '-g': + gtotal = 1 + elif option[0] == '-m': + show_memory = 1 + elif option[0] == '-o': + outfile = option[1] + elif option[0] == '-t': + tit = option[1] + + filenames = pargs + bars = [] + n = 0 + for filename in filenames: + values = get_values(filename) + print("Values-->", values) + bars.append(plot_bar(values, n)) + n += 1 + show_plot(bars, filenames, tit) diff --git a/bench/plot-comparison-lzo-zlib-ucl.gnuplot b/bench/plot-comparison-lzo-zlib-ucl.gnuplot new file mode 100644 index 0000000..bda1f52 --- /dev/null +++ b/bench/plot-comparison-lzo-zlib-ucl.gnuplot @@ -0,0 +1,27 @@ +#set term post color +set term post eps color +set xlabel "Number of rows" +set ylabel "Speed (Krow/s)" + +set linestyle 1 lw 7 +set linestyle 2 lw 7 +set linestyle 3 lw 7 +set linestyle 4 lw 7 +set logscale x + +# For small record size +set output "read-small-lzo-zlib-ucl-comparison.eps" +set tit "Selecting with small record size (16 bytes)" +pl [1000:] [0:1000] "small-nc.out" u ($1):($10) t "No compression" w linesp ls 1, \ + "small-zlib.out" u ($1):($10) t "ZLIB" w linesp ls 2, \ + "small-lzo.out" u ($1):($10) t "LZO" w linesp ls 3, \ + "small-ucl.out" u ($1):($10) t "UCL" w linesp ls 4 + +# For small record size +set output "write-small-lzo-zlib-ucl-comparison.eps" +set tit "Writing with small record size (16 bytes)" +pl [1000:] [0:500] "small-nc.out" u ($1):($5) tit "No compression" w linesp ls 1, \ + "small-zlib.out" u ($1):($5) tit "ZLIB" w linesp ls 2, \ + "small-lzo.out" u ($1):($5) tit "LZO" w linesp ls 3, \ + "small-ucl.out" u ($1):($5) tit "UCL" w linesp ls 4 + diff --git a/bench/plot-comparison-psyco-lzo.gnuplot b/bench/plot-comparison-psyco-lzo.gnuplot new file mode 100644 index 0000000..fb71de9 --- /dev/null +++ b/bench/plot-comparison-psyco-lzo.gnuplot @@ -0,0 +1,28 @@ +#set term post color +set term post eps color +set xlabel "Number of rows" +set ylabel "Speed (Krow/s)" + +set linestyle 1 lw 7 +set 
linestyle 2 lw 7 +set linestyle 3 lw 7 +set linestyle 4 lw 7 + +# For small record size +set output "read-small-psyco-lzo-comparison.eps" +set tit "Selecting with small record size (16 bytes)" +set logscale x +pl [1000:] [0:1200] "small-psyco-lzo.out" u ($1):($10) t "Psyco & compression (LZO)" w linesp ls 2, \ + "small-psyco-nc.out" u ($1):($10) tit "Psyco & no compresion" w linesp ls 3, \ + "small-lzo.out" u ($1):($10) t "No Psyco & compression (LZO)" w linesp ls 1, \ + "small-nc.out" u ($1):($10) tit "No Psyco & no compression" w linesp ls 4 + +# For small record size +set output "write-small-psyco-lzo-comparison.eps" +set tit "Writing with small record size (16 bytes)" +set logscale x +pl [1000:] [0:1000] "small-psyco-lzo.out" u ($1):($5) t "Psyco & compression (LZO)" w linesp ls 2, \ + "small-psyco-nc.out" u ($1):($5) tit "Psyco & no compresion" w linesp ls 3, \ + "small-lzo.out" u ($1):($5) t "No Psyco & compression (LZO)" w linesp ls 1, \ + "small-nc.out" u ($1):($5) tit "No Psyco & no compression" w linesp ls 4 + diff --git a/bench/poly.py b/bench/poly.py new file mode 100644 index 0000000..e80a93b --- /dev/null +++ b/bench/poly.py @@ -0,0 +1,188 @@ +"""This script compares the speed of the computation of a polynomial for +different (numpy.memmap and tables.Expr) out-of-memory paradigms.""" + +from pathlib import Path +from time import perf_counter as clock + +import numpy as np +import tables as tb +import numexpr as ne + +expr = ".25*x**3 + .75*x**2 - 1.5*x - 2" # the polynomial to compute +N = 10 * 1000 * 1000 # the number of points to compute expression (80 MB) +step = 100 * 1000 # perform calculation in slices of `step` elements +dtype = np.dtype('f8') # the datatype +#CHUNKSHAPE = (2**17,) +CHUNKSHAPE = None + +# Global variable for the x values for pure numpy & numexpr +x = None + +# *** The next variables do not need to be changed *** + +# Filenames for numpy.memmap +fprefix = "numpy.memmap" # the I/O file prefix +mpfnames = [fprefix + "-x.bin", fprefix + "-r.bin"] + +# Filename for tables.Expr +h5fname = "tablesExpr.h5" # the I/O file + +MB = 1024 * 1024 # a MegaByte + + +def print_filesize(filename, clib=None, clevel=0): + """Print some statistics about file sizes.""" + + # os.system("sync") # make sure that all data has been flushed to disk + if isinstance(filename, list): + filesize_bytes = sum(Path(fname).stat().st_size for fname in filename) + else: + filesize_bytes = Path(filename).stat().st_size + print( + f"\t\tTotal file sizes: {filesize_bytes} -- " + f"({filesize_bytes / MB:.1f} MB)", end=' ') + if clevel > 0: + print(f"(using {clib} lvl{clevel})") + else: + print() + + +def populate_x_numpy(): + """Populate the values in x axis for numpy.""" + global x + # Populate x in range [-1, 1] + x = np.linspace(-1, 1, N) + + +def populate_x_memmap(): + """Populate the values in x axis for numpy.memmap.""" + # Create container for input + x = np.memmap(mpfnames[0], dtype=dtype, mode="w+", shape=(N,)) + + # Populate x in range [-1, 1] + for i in range(0, N, step): + chunk = np.linspace((2 * i - N) / N, + (2 * (i + step) - N) / N, step) + x[i:i + step] = chunk + del x # close x memmap + + +def populate_x_tables(clib, clevel): + """Populate the values in x axis for pytables.""" + f = tb.open_file(h5fname, "w") + + # Create container for input + atom = tb.Atom.from_dtype(dtype) + filters = tb.Filters(complib=clib, complevel=clevel) + x = f.create_carray(f.root, "x", atom=atom, shape=(N,), + filters=filters, + chunkshape=CHUNKSHAPE, + ) + + # Populate x in range [-1, 1] + for i 
in range(0, N, step): + chunk = np.linspace((2 * i - N) / N, + (2 * (i + step) - N) / N, step) + x[i:i + step] = chunk + f.close() + + +def compute_numpy(): + """Compute the polynomial with pure numpy.""" + y = eval(expr) + + +def compute_numexpr(): + """Compute the polynomial with pure numexpr.""" + y = ne.evaluate(expr) + + +def compute_memmap(): + """Compute the polynomial with numpy.memmap.""" + # Reopen inputs in read-only mode + x = np.memmap(mpfnames[0], dtype=dtype, mode='r', shape=(N,)) + # Create the array output + r = np.memmap(mpfnames[1], dtype=dtype, mode="w+", shape=(N,)) + + # Do the computation by chunks and store in output + r[:] = eval(expr) # where is stored the result? + # r = eval(expr) # result is stored in-memory + + del x, r # close x and r memmap arrays + print_filesize(mpfnames) + + +def compute_tables(clib, clevel): + """Compute the polynomial with tables.Expr.""" + f = tb.open_file(h5fname, "a") + x = f.root.x # get the x input + # Create container for output + atom = tb.Atom.from_dtype(dtype) + filters = tb.Filters(complib=clib, complevel=clevel) + r = f.create_carray(f.root, "r", atom=atom, shape=(N,), + filters=filters, + chunkshape=CHUNKSHAPE, + ) + + # Do the actual computation and store in output + ex = tb.Expr(expr) # parse the expression + ex.set_output(r) # where is stored the result? + # when commented out, the result goes in-memory + ex.eval() # evaluate! + + f.close() + print_filesize(h5fname, clib, clevel) + + +if __name__ == '__main__': + + tb.print_versions() + + print(f"Total size for datasets: {2 * N * dtype.itemsize / MB:.1f} MB") + + # Get the compression libraries supported + # supported_clibs = [clib for clib in ("zlib", "lzo", "bzip2", "blosc") + # supported_clibs = [clib for clib in ("zlib", "lzo", "blosc") + supported_clibs = [clib for clib in ("blosc",) + if tb.which_lib_version(clib)] + + # Initialization code + # for what in ["numpy", "numpy.memmap", "numexpr"]: + for what in ["numpy", "numexpr"]: + # break + print("Populating x using %s with %d points..." % (what, N)) + t0 = clock() + if what == "numpy": + populate_x_numpy() + compute = compute_numpy + elif what == "numexpr": + populate_x_numpy() + compute = compute_numexpr + elif what == "numpy.memmap": + populate_x_memmap() + compute = compute_memmap + print(f"*** Time elapsed populating: {clock() - t0:.3f}") + print(f"Computing: {expr!r} using {what}") + t0 = clock() + compute() + print(f"**************** Time elapsed computing: {clock() - t0:.3f}") + + for what in ["tables.Expr"]: + t0 = clock() + first = True # Sentinel + for clib in supported_clibs: + # for clevel in (0, 1, 3, 6, 9): + for clevel in range(10): + # for clevel in (1,): + if not first and clevel == 0: + continue + print("Populating x using %s with %d points..." 
% (what, N)) + populate_x_tables(clib, clevel) + print(f"*** Time elapsed populating: {clock() - t0:.3f}") + print(f"Computing: {expr!r} using {what}") + t0 = clock() + compute_tables(clib, clevel) + print( + f"**************** Time elapsed computing: " + f"{clock() - t0:.3f}") + first = False diff --git a/bench/postgres-search-bench.py b/bench/postgres-search-bench.py new file mode 100644 index 0000000..e08999c --- /dev/null +++ b/bench/postgres-search-bench.py @@ -0,0 +1,247 @@ +from time import perf_counter as clock +import numpy as np +import random + +DSN = "dbname=test port = 5435" + +# in order to always generate the same random sequence +random.seed(19) + + +def flatten(l): + """Flattens list of tuples l.""" + return [x[0] for x in l] + + +def fill_arrays(start, stop): + col_i = np.arange(start, stop, type=np.int32) + if userandom: + col_j = np.random.uniform(0, nrows, size=[stop - start]) + else: + col_j = np.array(col_i, type=np.float64) + return col_i, col_j + +# Generator for ensure pytables benchmark compatibility + + +def int_generator(nrows): + step = 1000 * 100 + j = 0 + for i in range(nrows): + if i >= step * j: + stop = (j + 1) * step + if stop > nrows: # Seems unnecessary + stop = nrows + col_i, col_j = fill_arrays(i, stop) + j += 1 + k = 0 + yield (col_i[k], col_j[k]) + k += 1 + + +def int_generator_slow(nrows): + for i in range(nrows): + if userandom: + yield (i, float(random.randint(0, nrows))) + else: + yield (i, float(i)) + + +class Stream32: + + "Object simulating a file for reading" + + def __init__(self): + self.n = None + self.read_it = self.read_iter() + + # No va! Hi ha que convertir a un de normal! + def readline(self, n=None): + for tup in int_generator(nrows): + sout = "%s\t%s\n" % tup + if n is not None and len(sout) > n: + for i in range(0, len(sout), n): + yield sout[i:i + n] + else: + yield sout + + def read_iter(self): + sout = "" + n = self.n + for tup in int_generator(nrows): + sout += "%s\t%s\n" % tup + if n is not None and len(sout) > n: + for i in range(n, len(sout), n): + rout = sout[:n] + sout = sout[n:] + yield rout + yield sout + + def read(self, n=None): + self.n = n + try: + str = next(self.read_it) + except StopIteration: + str = "" + return str + + +def open_db(filename, remove=0): + if not filename: + con = sqlite.connect(DSN) + else: + con = sqlite.connect(filename) + cur = con.cursor() + return con, cur + + +def create_db(filename, nrows): + con, cur = open_db(filename, remove=1) + try: + cur.execute("create table ints(i integer, j double precision)") + except: + con.rollback() + cur.execute("DROP TABLE ints") + cur.execute("create table ints(i integer, j double precision)") + con.commit() + con.set_isolation_level(2) + t1 = clock() + st = Stream32() + cur.copy_from(st, "ints") + # In case of postgres, the speeds of generator and loop are similar + #cur.executemany("insert into ints values (%s,%s)", int_generator(nrows)) +# for i in xrange(nrows): +# cur.execute("insert into ints values (%s,%s)", (i, float(i))) + con.commit() + ctime = clock() - t1 + if verbose: + print(f"insert time: {ctime:.5f}") + print(f"Krows/s: {nrows / 1000 / ctime:.5f}") + close_db(con, cur) + + +def index_db(filename): + con, cur = open_db(filename) + t1 = clock() + cur.execute("create index ij on ints(j)") + con.commit() + itime = clock() - t1 + if verbose: + print(f"index time: {itime:.5f}") + print(f"Krows/s: {nrows / itime:.5f}") + # Close the DB + close_db(con, cur) + + +def query_db(filename, rng): + con, cur = open_db(filename) + t1 = clock() + ntimes = 
10 + for i in range(ntimes): + # between clause does not seem to take advantage of indexes + # cur.execute("select j from ints where j between %s and %s" % \ + cur.execute("select i from ints where j >= %s and j <= %s" % + # cur.execute("select i from ints where i >= %s and i <= + # %s" % + (rng[0] + i, rng[1] + i)) + results = cur.fetchall() + con.commit() + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time: {qtime:.5f}") + print(f"Mrows/s: {nrows / 1000 / qtime:.5f}") + results = sorted(flatten(results)) + print(results) + close_db(con, cur) + + +def close_db(con, cur): + cur.close() + con.close() + +if __name__ == "__main__": + import sys + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-m] [-i] [-q] [-c] [-R range] [-n nrows] file + -v verbose + -p use "psyco" if available + -m use random values to fill the table + -q do query + -c create the database + -i index the table + -2 use sqlite2 (default is use sqlite3) + -R select a range in a field in the form "start,stop" (def "0,10") + -n sets the number of rows (in krows) in each table + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpmiqc2R:n:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + usepsyco = 0 + userandom = 0 + docreate = 0 + createindex = 0 + doquery = 0 + sqlite_version = "3" + rng = [0, 10] + nrows = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-p': + usepsyco = 1 + elif option[0] == '-m': + userandom = 1 + elif option[0] == '-i': + createindex = 1 + elif option[0] == '-q': + doquery = 1 + elif option[0] == '-c': + docreate = 1 + elif option[0] == "-2": + sqlite_version = "2" + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-n': + nrows = int(option[1]) + + # Catch the hdf5 file passed as the last argument + filename = pargs[0] + +# if sqlite_version == "2": +# import sqlite +# else: +# from pysqlite2 import dbapi2 as sqlite + import psycopg2 as sqlite + + if verbose: + # print "pysqlite version:", sqlite.version + if userandom: + print("using random values") + + if docreate: + if verbose: + print("writing %s krows" % nrows) + if psyco_imported and usepsyco: + psyco.bind(create_db) + nrows *= 1000 + create_db(filename, nrows) + + if createindex: + index_db(filename) + + if doquery: + query_db(filename, rng) diff --git a/bench/postgres_backend.py b/bench/postgres_backend.py new file mode 100644 index 0000000..c547e74 --- /dev/null +++ b/bench/postgres_backend.py @@ -0,0 +1,155 @@ +import subprocess +from indexed_search import DB +import psycopg2 as db2 + +CLUSTER_NAME = "base" +DATA_DIR = "/scratch2/postgres/data/%s" % CLUSTER_NAME +#DATA_DIR = "/var/lib/pgsql/data/%s" % CLUSTER_NAME +DSN = "dbname=%s port=%s" +CREATE_DB = "createdb %s" +DROP_DB = "dropdb %s" +TABLE_NAME = "intsfloats" +PORT = 5432 + + +class StreamChar: + "Object simulating a file for reading" + + def __init__(self, db): + self.db = db + self.nrows = db.nrows + self.step = db.step + self.read_it = self.read_iter() + + def values_generator(self): + j = 0 + for i in range(self.nrows): + if i >= j * self.step: + stop = (j + 1) * self.step + if stop > self.nrows: + stop = self.nrows + arr_i4, arr_f8 = self.db.fill_arrays(i, stop) + j += 1 + k = 0 + yield (arr_i4[k], arr_i4[k], arr_f8[k], arr_f8[k]) + k += 1 + + def read_iter(self): + sout = "" + n = self.nbytes + for tup in 
self.values_generator(): + sout += "%s\t%s\t%s\t%s\n" % tup + if n is not None and len(sout) > n: + for i in range(n, len(sout), n): + rout = sout[:n] + sout = sout[n:] + yield rout + yield sout + + def read(self, n=None): + self.nbytes = n + try: + str = next(self.read_it) + except StopIteration: + str = "" + return str + + # required by postgres2 driver, but not used + def readline(self): + pass + + +class Postgres_DB(DB): + + def __init__(self, nrows, rng, userandom): + DB.__init__(self, nrows, rng, userandom) + self.port = PORT + + def flatten(self, l): + """Flattens list of tuples l.""" + return [x[0] for x in l] + # return map(lambda x: x[col], l) + + # Overloads the method in DB class + def get_db_size(self): + sout = subprocess.Popen("sudo du -s %s" % DATA_DIR, + shell=True, + stdout=subprocess.PIPE).stdout + line = [l for l in sout][0] + return int(line.split()[0]) + + def open_db(self, remove=0): + if remove: + sout = subprocess.Popen(DROP_DB % self.filename, shell=True, + stdout=subprocess.PIPE).stdout + for line in sout: + print(line) + sout = subprocess.Popen(CREATE_DB % self.filename, shell=True, + stdout=subprocess.PIPE).stdout + for line in sout: + print(line) + + print("Processing database:", self.filename) + con = db2.connect(DSN % (self.filename, self.port)) + self.cur = con.cursor() + return con + + def create_table(self, con): + self.cur.execute("""create table %s( + col1 integer, + col2 integer, + col3 double precision, + col4 double precision)""" % TABLE_NAME) + con.commit() + + def fill_table(self, con): + st = StreamChar(self) + self.cur.copy_from(st, TABLE_NAME) + con.commit() + + def index_col(self, con, colname, optlevel, idxtype, verbose): + self.cur.execute("create index %s on %s(%s)" % + (colname + '_idx', TABLE_NAME, colname)) + con.commit() + + def do_query_simple(self, con, column, base): + self.cur.execute( + "select sum(%s) from %s where %s >= %s and %s <= %s" % + (column, TABLE_NAME, + column, base + self.rng[0], + column, base + self.rng[1])) +# "select * from %s where %s >= %s and %s <= %s" % \ +# (TABLE_NAME, +# column, base+self.rng[0], +# column, base+self.rng[1])) + #results = self.flatten(self.cur.fetchall()) + results = self.cur.fetchall() + return results + + def do_query(self, con, column, base, *unused): + d = (self.rng[1] - self.rng[0]) / 2 + inf1 = int(self.rng[0] + base) + sup1 = int(self.rng[0] + d + base) + inf2 = self.rng[0] + base * 2 + sup2 = self.rng[0] + d + base * 2 + # print "lims-->", inf1, inf2, sup1, sup2 + condition = "((%s>=%s) and (%s<%s)) or ((col2>%s) and (col2<%s))" + #condition = "((col3>=%s) and (col3<%s)) or ((col1>%s) and (col1<%s))" + condition += " and ((col1+3.1*col2+col3*col4) > 3)" + #condition += " and (sqrt(col1^2+col2^2+col3^2+col4^2) > .1)" + condition = condition % (column, inf2, column, sup2, inf1, sup1) + # print "condition-->", condition + self.cur.execute( + # "select sum(%s) from %s where %s" % + "select %s from %s where %s" % + (column, TABLE_NAME, condition)) + #results = self.flatten(self.cur.fetchall()) + results = self.cur.fetchall() + #results = self.cur.fetchall() + # print "results-->", results + # return results + return len(results) + + def close_db(self, con): + self.cur.close() + con.close() diff --git a/bench/pytables-search-bench.py b/bench/pytables-search-bench.py new file mode 100644 index 0000000..f9835d7 --- /dev/null +++ b/bench/pytables-search-bench.py @@ -0,0 +1,221 @@ +import random +from pathlib import Path +from time import perf_counter as clock + +import numpy as np +import 
tables as tb + +# in order to always generate the same random sequence +random.seed(19) +np.random.seed((19, 20)) + + +def open_db(filename, remove=0): + if remove and Path(filename).is_file(): + Path(filename).unlink() + con = tb.open_file(filename, 'a') + return con + + +def create_db(filename, nrows): + + class Record(tb.IsDescription): + col1 = tb.Int32Col() + col2 = tb.Int32Col() + col3 = tb.Float64Col() + col4 = tb.Float64Col() + + con = open_db(filename, remove=1) + table = con.create_table(con.root, 'table', Record, + filters=filters, expectedrows=nrows) + table.indexFilters = filters + step = 1000 * 100 + scale = 0.1 + t1 = clock() + j = 0 + for i in range(0, nrows, step): + stop = (j + 1) * step + if stop > nrows: + stop = nrows + arr_f8 = np.arange(i, stop, type=np.float64) + arr_i4 = np.arange(i, stop, type=np.int32) + if userandom: + arr_f8 += np.random.normal(0, stop * scale, shape=[stop - i]) + arr_i4 = np.array(arr_f8, type=np.int32) + recarr = np.rec.fromarrays([arr_i4, arr_i4, arr_f8, arr_f8]) + table.append(recarr) + j += 1 + table.flush() + ctime = clock() - t1 + if verbose: + print(f"insert time: {ctime:.5f}") + print(f"Krows/s: {nrows / 1000 / ctime:.5f}") + index_db(table) + close_db(con) + + +def index_db(table): + t1 = clock() + table.cols.col2.create_index() + itime = clock() - t1 + if verbose: + print(f"index time (int): {itime:.5f}") + print(f"Krows/s: {nrows / 1000 / itime:.5f}") + t1 = clock() + table.cols.col4.create_index() + itime = clock() - t1 + if verbose: + print(f"index time (float): {itime:.5f}") + print(f"Krows/s: {nrows / 1000 / itime:.5f}") + + +def query_db(filename, rng): + con = open_db(filename) + table = con.root.table + # Query for integer columns + # Query for non-indexed column + if not doqueryidx: + t1 = clock() + ntimes = 10 + for i in range(ntimes): + results = [ + r['col1'] for r in table.where( + rng[0] + i <= table.cols.col1 <= rng[1] + i) + ] + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time (int, not indexed): {qtime:.5f}") + print(f"Krows/s: {nrows / 1000 / qtime:.5f}") + print(results) + # Query for indexed column + t1 = clock() + ntimes = 10 + for i in range(ntimes): + results = [ + r['col1'] for r in table.where( + rng[0] + i <= table.cols.col2 <= rng[1] + i) + ] + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time (int, indexed): {qtime:.5f}") + print(f"Krows/s: {nrows / 1000 / qtime:.5f}") + print(results) + # Query for floating columns + # Query for non-indexed column + if not doqueryidx: + t1 = clock() + ntimes = 10 + for i in range(ntimes): + results = [ + r['col3'] for r in table.where( + rng[0] + i <= table.cols.col3 <= rng[1] + i) + ] + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time (float, not indexed): {qtime:.5f}") + print(f"Krows/s: {nrows / 1000 / qtime:.5f}") + print(results) + # Query for indexed column + t1 = clock() + ntimes = 10 + for i in range(ntimes): + results = [r['col3'] for r in + table.where(rng[0] + i <= table.cols.col4 <= rng[1] + i)] + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time (float, indexed): {qtime:.5f}") + print(f"Krows/s: {nrows / 1000 / qtime:.5f}") + print(results) + close_db(con) + + +def close_db(con): + con.close() + +if __name__ == "__main__": + import sys + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-m] [-c] [-q] [-i] [-z complevel] [-l complib] [-R range] [-n nrows] file + -v verbose + -p use "psyco" if available + -m use random 
values to fill the table + -q do a query (both indexed and non-indexed version) + -i do a query (exclude non-indexed version) + -c create the database + -z compress with zlib (no compression by default) + -l use complib for compression (zlib used by default) + -R select a range in a field in the form "start,stop" (def "0,10") + -n sets the number of rows (in krows) in each table + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpmcqiz:l:R:n:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + usepsyco = 0 + userandom = 0 + docreate = 0 + docompress = 0 + complib = "zlib" + doquery = 0 + doqueryidx = 0 + rng = [0, 10] + nrows = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-p': + usepsyco = 1 + elif option[0] == '-m': + userandom = 1 + elif option[0] == '-c': + docreate = 1 + createindex = 1 + elif option[0] == '-q': + doquery = 1 + elif option[0] == '-i': + doqueryidx = 1 + elif option[0] == '-z': + docompress = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-n': + nrows = int(option[1]) + + # Catch the hdf5 file passed as the last argument + filename = pargs[0] + + # The filters chosen + filters = tb.Filters(complevel=docompress, complib=complib) + + if verbose: + print("pytables version:", tb.__version__) + if userandom: + print("using random values") + if doqueryidx: + print("doing indexed queries only") + + if docreate: + if verbose: + print("writing %s krows" % nrows) + if psyco_imported and usepsyco: + psyco.bind(create_db) + nrows *= 1000 + create_db(filename, nrows) + + if doquery: + query_db(filename, rng) diff --git a/bench/pytables_backend.py b/bench/pytables_backend.py new file mode 100644 index 0000000..60567b4 --- /dev/null +++ b/bench/pytables_backend.py @@ -0,0 +1,191 @@ +import os +from pathlib import Path + +import tables as tb +from indexed_search import DB + + +class PyTables_DB(DB): + + def __init__(self, nrows, rng, userandom, datadir, + docompress=0, complib='zlib', kind="medium", optlevel=6): + DB.__init__(self, nrows, rng, userandom) + self.tprof = [] + # Specific part for pytables + self.docompress = docompress + self.complib = complib + # Complete the filename + self.filename = "pro-" + self.filename + self.filename += '-' + 'O%s' % optlevel + self.filename += '-' + kind + if docompress: + self.filename += '-' + complib + str(docompress) + self.datadir = datadir + path = Path(self.datadir) + if not path.is_dir(): + if not path.is_absolute(): + dir_path = Path('.') / self.datadir + else: + dir_path = Path(self.datadir) + dir_path.mkdir(parents=True, exist_ok=True) + self.datadir = dir_path + print(f"Created {self.datadir}.") + self.filename = self.datadir / f'{self.filename}.h5' + # The chosen filters + self.filters = tb.Filters(complevel=self.docompress, + complib=self.complib, + shuffle=1) + print("Processing database:", self.filename) + + def open_db(self, remove=0): + if remove and Path(self.filename).is_file(): + Path(self.filename).unlink() + con = tb.open_file(self.filename, 'a') + return con + + def close_db(self, con): + # Remove first the table_cache attribute if it exists + if hasattr(self, "table_cache"): + del self.table_cache + con.close() + + def create_table(self, con): + class Record(tb.IsDescription): + col1 = tb.Int32Col() + col2 = tb.Int32Col() + col3 = tb.Float64Col() + col4 = tb.Float64Col() + + 
con.create_table(con.root, 'table', Record, + filters=self.filters, expectedrows=self.nrows) + + def fill_table(self, con): + "Fills the table" + table = con.root.table + j = 0 + for i in range(0, self.nrows, self.step): + stop = (j + 1) * self.step + if stop > self.nrows: + stop = self.nrows + arr_i4, arr_f8 = self.fill_arrays(i, stop) +# recarr = records.fromarrays([arr_i4, arr_i4, arr_f8, arr_f8]) +# table.append(recarr) + table.append([arr_i4, arr_i4, arr_f8, arr_f8]) + j += 1 + table.flush() + + def index_col(self, con, column, kind, optlevel, verbose): + col = getattr(con.root.table.cols, column) + tmp_dir = self.datadir / "scratch2" + tmp_dir.mkdir(parents=True, exist_ok=True) + col.create_index(kind=kind, optlevel=optlevel, filters=self.filters, + tmp_dir=tmp_dir, _verbose=verbose, _blocksizes=None) +# _blocksizes=(2**27, 2**22, 2**15, 2**7)) +# _blocksizes=(2**27, 2**22, 2**14, 2**6)) +# _blocksizes=(2**27, 2**20, 2**13, 2**5), +# _testmode=True) + + def do_query(self, con, column, base, inkernel): + if True: + if not hasattr(self, "table_cache"): + self.table_cache = table = con.root.table + self.colobj = getattr(table.cols, column) + #self.colobj = getattr(table.cols, 'col1') + self.condvars = {"col": self.colobj, + "col1": table.cols.col1, + "col2": table.cols.col2, + "col3": table.cols.col3, + "col4": table.cols.col4, + } + table = self.table_cache + colobj = self.colobj + else: + table = con.root.table + colobj = getattr(table.cols, column) + self.condvars = {"col": colobj, + "col1": table.cols.col1, + "col2": table.cols.col2, + "col3": table.cols.col3, + "col4": table.cols.col4, + } + self.condvars['inf'] = self.rng[0] + base + self.condvars['sup'] = self.rng[1] + base + # For queries that can use two indexes instead of just one + d = (self.rng[1] - self.rng[0]) / 2 + inf1 = int(self.rng[0] + base) + sup1 = int(self.rng[0] + d + base) + inf2 = self.rng[0] + base * 2 + sup2 = self.rng[0] + d + base * 2 + self.condvars['inf1'] = inf1 + self.condvars['sup1'] = sup1 + self.condvars['inf2'] = inf2 + self.condvars['sup2'] = sup2 + #condition = "(inf == col2)" + #condition = "(inf==col2) & (col4==sup)" + #condition = "(inf==col2) | (col4==sup)" + #condition = "(inf==col2) | (col2==sup)" + #condition = "(inf==col2) & (col3==sup)" + #condition = "((inf==col2) & (sup==col4)) & (col3==sup)" + #condition = "((inf==col1) & (sup==col4)) & (col3==sup)" + #condition = "(inf<=col1) & (col3", inf1, inf2, sup1, sup2 + condition = "((inf2<=col) & (col", c['inf'], c['sup'], c['inf2'], c['sup2'] + ncoords = 0 + if colobj.is_indexed: + results = [r[column] + for r in table.where(condition, self.condvars)] +# coords = table.get_where_list(condition, self.condvars) +# results = table.read_coordinates(coords, field=column) + +# results = table.read_where(condition, self.condvars, field=column) + + elif inkernel: + print("Performing in-kernel query") + results = [r[column] + for r in table.where(condition, self.condvars)] + #coords = [r.nrow for r in table.where(condition, self.condvars)] + #results = table.read_coordinates(coords) +# for r in table.where(condition, self.condvars): +# var = r[column] +# ncoords += 1 + else: +# coords = [r.nrow for r in table +# if (self.rng[0]+base <= r[column] <= self.rng[1]+base)] +# results = table.read_coordinates(coords) + print("Performing regular query") + results = [ + r[column] for r in table if (( + (inf2 <= r['col4']) and (r['col4'] < sup2)) or + ((inf1 < r['col2']) and (r['col2'] < sup1)) and + ((r['col1'] + 3.1 * r['col2'] + r['col3'] * r['col4']) > 3) + 
)] + + ncoords = len(results) + + # return coords + # print "results-->", results + # return results + return ncoords + #self.tprof.append( self.colobj.index.tprof ) + # return ncoords, self.tprof diff --git a/bench/recarray2-test.py b/bench/recarray2-test.py new file mode 100644 index 0000000..a9f4f97 --- /dev/null +++ b/bench/recarray2-test.py @@ -0,0 +1,101 @@ +import sys +from pathlib import Path +from time import perf_counter as clock +import numpy as np +import chararray +import recarray +import recarray2 # This is my modified version + +usage = """usage: %s recordlength + Set recordlength to 1000 at least to obtain decent figures! +""" % sys.argv[0] + +try: + reclen = int(sys.argv[1]) +except: + print(usage) + sys.exit() + +delta = 0.000_001 + +# Creation of recarrays objects for test +x1 = np.array(np.arange(reclen)) +x2 = chararray.array(None, itemsize=7, shape=reclen) +x3 = np.array(np.arange(reclen, reclen * 3, 2), np.float64) +r1 = recarray.fromarrays([x1, x2, x3], names='a,b,c') +r2 = recarray2.fromarrays([x1, x2, x3], names='a,b,c') + +print("recarray shape in test ==>", r2.shape) + +print("Assignment in recarray original") +print("-------------------------------") +t1 = clock() +for row in range(reclen): + #r1.field("b")[row] = "changed" + r1.field("c")[row] = float(row ** 2) +t2 = clock() +origtime = t2 - t1 +print(f"Assign time: {origtime:.3f} Rows/s: {reclen / (origtime + delta):.0f}") +# print "Field b on row 2 after re-assign:", r1.field("c")[2] +print() + +print("Assignment in recarray modified") +print("-------------------------------") +t1 = clock() +for row in range(reclen): + rec = r2._row(row) # select the row to be changed + # rec.b = "changed" # change the "b" field + rec.c = float(row ** 2) # Change the "c" field +t2 = clock() +ttime = t2 - t1 +print(f"Assign time: {ttime:.3f} Rows/s: {reclen / (ttime + delta):.0f}", end=' ') +print(f" Speed-up: {origtime / ttime:.3f}") +# print "Field b on row 2 after re-assign:", r2.field("c")[2] +print() + +print("Selection in recarray original") +print("------------------------------") +t1 = clock() +for row in range(reclen): + rec = r1[row] + if rec.field("a") < 3: + print("This record pass the cut ==>", rec.field("c"), "(row", row, ")") +t2 = clock() +origtime = t2 - t1 +print(f"Select time: {origtime:.3f}, Rows/s: {reclen / (origtime + delta):.0f}") +print() + +print("Selection in recarray modified") +print("------------------------------") +t1 = clock() +for row in range(reclen): + rec = r2._row(row) + if rec.a < 3: + print("This record pass the cut ==>", rec.c, "(row", row, ")") +t2 = clock() +ttime = t2 - t1 +print(f"Select time: {ttime:.3f} Rows/s: {reclen / (ttime + delta):.0f}", end=' ') +print(f" Speed-up: {origtime / ttime:.3f}") +print() + +print("Printing in recarray original") +print("------------------------------") +with Path("test.out").open("w") as f: + t1 = clock() + f.write(str(r1)) + t2 = clock() + origtime = t2 - t1 +Path("test.out").unlink() +print(f"Print time: {origtime:.3f} Rows/s: {reclen / (origtime + delta):.0f}") +print() +print("Printing in recarray modified") +print("------------------------------") +with Path("test2.out").open("w") as f: + t1 = clock() + f.write(str(r2)) + t2 = clock() + ttime = t2 - t1 +Path("test2.out").unlink() +print(f"Print time: {ttime:.3f} Rows/s: {reclen / (ttime + delta):.0f}", end=' ') +print(f" Speed-up: {origtime / ttime:.3f}") +print() diff --git a/bench/search-bench-plot.py b/bench/search-bench-plot.py new file mode 100644 index 0000000..0e8dbf3 --- /dev/null 
+++ b/bench/search-bench-plot.py @@ -0,0 +1,147 @@ +import tables as tb +from pylab import * + + +def get_values(filename, complib=''): + f = tb.open_file(filename) + nrows = f.root.small.create_best.cols.nrows[:] + corrected_sizes = nrows / 10 ** 6 + if mb_units: + corrected_sizes = 16 * nrows / 10 ** 6 + if insert: + values = corrected_sizes / f.root.small.create_best.cols.tfill[:] + if table_size: + values = f.root.small.create_best.cols.fsize[:] / nrows + if query: + values = corrected_sizes / \ + f.root.small.search_best.inkernel.int.cols.time1[:] + if query_cache: + values = corrected_sizes / \ + f.root.small.search_best.inkernel.int.cols.time2[:] + + f.close() + return nrows, values + + +def show_plot(plots, yaxis, legends, gtitle): + xlabel('Number of rows') + ylabel(yaxis) + xlim(10 ** 3, 10 ** 8) + title(gtitle) + grid(True) + +# legends = [f[f.find('-'):f.index('.out')] for f in filenames] +# legends = [l.replace('-', ' ') for l in legends] + if table_size: + legend([p[0] for p in plots], legends, loc="upper right") + else: + legend([p[0] for p in plots], legends, loc="upper left") + + #subplots_adjust(bottom=0.2, top=None, wspace=0.2, hspace=0.2) + if outfile: + savefig(outfile) + else: + show() + +if __name__ == '__main__': + + import sys + import getopt + + usage = """usage: %s [-o file] [-t title] [--insert] [--table-size] [--query] [--query-cache] [--MB-units] files + -o filename for output (only .png and .jpg extensions supported) + -t title of the plot + --insert -- Insert time for table + --table-size -- Size of table + --query -- Time for querying the integer column + --query-cache -- Time for querying the integer (cached) + --MB-units -- Express speed in MB/s instead of MRows/s + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'o:t:', + ['insert', + 'table-size', + 'query', + 'query-cache', + 'MB-units', + ]) + except: + sys.stderr.write(usage) + sys.exit(0) + + progname = sys.argv[0] + args = sys.argv[1:] + + # if we pass too few parameters, abort + if len(pargs) < 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + outfile = None + insert = 0 + table_size = 0 + query = 0 + query_cache = 0 + mb_units = 0 + yaxis = "No axis name" + tit = None + gtitle = "Please set a title!" 
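+    # Each of the long options below selects which column of the bench file
+    # (written by search-bench.py) is plotted, and sets the axis label and
+    # title accordingly.
+    # Hypothetical example invocation (file names follow search-bench.sh):
+    #   python search-bench-plot.py --query -o query.png \
+    #       dbench-cl-zlib-c1-S1.h5 dbench-cl-lzo-c1-S1.h5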
+ + # Get the options + for option in opts: + if option[0] == '-o': + outfile = option[1] + elif option[0] == '-t': + tit = option[1] + elif option[0] == '--insert': + insert = 1 + yaxis = "MRows/s" + gtitle = "Writing with small (16 bytes) record size" + elif option[0] == '--table-size': + table_size = 1 + yaxis = "Bytes/row" + gtitle = ("Disk space taken by a record (original record size: " + "16 bytes)") + elif option[0] == '--query': + query = 1 + yaxis = "MRows/s" + gtitle = ("Selecting with small (16 bytes) record size (file not " + "in cache)") + elif option[0] == '--query-cache': + query_cache = 1 + yaxis = "MRows/s" + gtitle = ("Selecting with small (16 bytes) record size (file in " + "cache)") + elif option[0] == '--MB-units': + mb_units = 1 + + filenames = pargs + + if mb_units and yaxis == "MRows/s": + yaxis = "MB/s" + + if tit: + gtitle = tit + + plots = [] + legends = [] + for filename in filenames: + plegend = filename[filename.find('cl-') + 3:filename.index('.h5')] + plegend = plegend.replace('-', ' ') + xval, yval = get_values(filename, '') + print(f"Values for {filename} --> {xval}, {yval}") + #plots.append(loglog(xval, yval, linewidth=5)) + plots.append(semilogx(xval, yval, linewidth=4)) + legends.append(plegend) + if 0: # Per a introduir dades simulades si es vol... + xval = [1000, 10_000, 100_000, 1_000_000, 10_000_000, + 100_000_000, 1_000_000_000] +# yval = [0.003, 0.005, 0.02, 0.06, 1.2, +# 40, 210] + yval = [0.0009, 0.0011, 0.0022, 0.005, 0.02, + 0.2, 5.6] + plots.append(loglog(xval, yval, linewidth=5)) + legends.append("PyTables Std") + show_plot(plots, yaxis, legends, gtitle) diff --git a/bench/search-bench-rnd.sh b/bench/search-bench-rnd.sh new file mode 100755 index 0000000..db24750 --- /dev/null +++ b/bench/search-bench-rnd.sh @@ -0,0 +1,122 @@ +#!/bin/sh +# I don't know why, but the /usr/bin/python2.3 from Debian is a 30% slower +# than my own compiled version! 
2004-08-18 +python="/usr/local/bin/python2.3 -O" + +writedata () { + nrows=$1 + bfile=$2 + worst=$3 + psyco=$4 + if [ "$shuffle" = "1" ]; then + shufflef="-S" + else + shufflef="" + fi + cmd="${python} search-bench.py -R ${worst} -b ${bfile} -h ${psyco} -l ${libcomp} -c ${complevel} ${shufflef} -w -n ${nrows} data.nobackup/bench-${libcomp}-${nrows}k.h5" + echo ${cmd} + ${cmd} +} + +readdata () { + nrows=$1 + bfile=$2 + worst=$3 + psyco=$4 + smode=$5 + + if [ "$smode" = "indexed" ]; then + #repeats=100 + repeats=20 + else + repeats=2 + fi + cmd="${python} search-bench.py -R ${worst} -h -b ${bfile} ${psyco} -m ${smode} -r -k ${repeats} data.nobackup/bench-${libcomp}-${nrows}k.h5" + echo ${cmd} + ${cmd} + return +} + +overwrite=0 +if [ $# > 1 ]; then + if [ "$1" = "-o" ]; then + overwrite=1 + fi +fi +if [ $# > 2 ]; then + psyco=$2 +fi + +# Configuration for testing +#nrowslist="50000" +#nrowslistworst="50000" + +# Normal test +#nrowslist="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000" +#nrowslistworst="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000" +nrowslist="1 2 5 10 20 50 100 200 500 1000" +nrowslistworst="1 2 5 10 20 50 100 200 500 1000" +#nrowslist="1 2 5 10" +#nrowslistworst="1 2 5 10" + +# The next can be regarded as parameters +shuffle=1 + +for libcomp in none zlib lzo; do +#for libcomp in none lzo; do + if [ "$libcomp" = "none" ]; then + complevel=0 + else + complevel=1 + fi + # The name of the data bench file + bfile="worst-dbench-cl-${libcomp}-c${complevel}-S${shuffle}.h5" + + # Move out a possible previous benchmark file + bn=`basename $bfile ".h5"` + mv -f ${bn}-bck2.h5 ${bn}-bck3.h5 + mv -f ${bn}-bck.h5 ${bn}-bck2.h5 + if [ "$overwrite" = "1" ]; then + echo "moving ${bn}.h5 to ${bn}-bck.h5" + mv -f ${bn}.h5 ${bn}-bck.h5 + else + echo "copying ${bn}.h5 to ${bn}-bck.h5" + cp -f ${bn}.h5 ${bn}-bck.h5 + fi + for worst in "" -t; do + #for worst in ""; do + # Write data files + if [ "$worst" = "-t" ]; then + echo + echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "Entering worst case..." 
+ echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo + nrowslist=$nrowslistworst + fi + # Write data file + for nrows in $nrowslist; do + echo "*************************************************************" + echo "Writing for nrows=$nrows Krows, psyco=$psyco, worst='${worst}'" + echo "*************************************************************" + writedata ${nrows} ${bfile} "${worst}" "${psyco}" + done + # Read data files + for smode in indexed inkernel standard; do + ${python} cacheout.py + for nrows in $nrowslist; do + echo "***********************************************************" + echo "Searching for nrows=$nrows Krows, $smode, psyco=$psyco, worst='${worst}'" + echo "***********************************************************" + readdata ${nrows} ${bfile} "${worst}" "${psyco}" "${smode}" + done + done + # Finally, after the final search, delete the source (if desired) +# for nrows in $nrowslist; do +# rm -f data.nobackup/bench-${libcomp}-${nrows}k.h5 +# done + done + echo "New data available on: $bfile" +done + +exit 0 diff --git a/bench/search-bench.py b/bench/search-bench.py new file mode 100644 index 0000000..66343e3 --- /dev/null +++ b/bench/search-bench.py @@ -0,0 +1,515 @@ +#!/usr/bin/env python + +import random +import sys +import warnings +from pathlib import Path +from time import perf_counter as clock +from time import process_time as cpuclock + +import numpy as np +import tables as tb + +# Initialize the random generator always with the same integer +# in order to have reproductible results +random.seed(19) +np.random.seed(19) + +randomvalues = 0 +worst = 0 + +Small = { + "var1": tb.StringCol(itemsize=4, dflt="Hi!", pos=2), + "var2": tb.Int32Col(pos=1), + "var3": tb.Float64Col(pos=0), + #"var4" : BoolCol(), +} + + +def createNewBenchFile(bfile, verbose): + + class Create(tb.IsDescription): + nrows = tb.Int32Col(pos=0) + irows = tb.Int32Col(pos=1) + tfill = tb.Float64Col(pos=2) + tidx = tb.Float64Col(pos=3) + tcfill = tb.Float64Col(pos=4) + tcidx = tb.Float64Col(pos=5) + rowsecf = tb.Float64Col(pos=6) + rowseci = tb.Float64Col(pos=7) + fsize = tb.Float64Col(pos=8) + isize = tb.Float64Col(pos=9) + psyco = tb.BoolCol(pos=10) + + class Search(tb.IsDescription): + nrows = tb.Int32Col(pos=0) + rowsel = tb.Int32Col(pos=1) + time1 = tb.Float64Col(pos=2) + time2 = tb.Float64Col(pos=3) + tcpu1 = tb.Float64Col(pos=4) + tcpu2 = tb.Float64Col(pos=5) + rowsec1 = tb.Float64Col(pos=6) + rowsec2 = tb.Float64Col(pos=7) + psyco = tb.BoolCol(pos=8) + + if verbose: + print("Creating a new benchfile:", bfile) + # Open the benchmarking file + bf = tb.open_file(bfile, "w") + # Create groups + for recsize in ["small"]: + group = bf.create_group("/", recsize, recsize + " Group") + # Attach the row size of table as attribute + if recsize == "small": + group._v_attrs.rowsize = 16 + # Create a Table for writing bench + bf.create_table(group, "create_best", Create, "best case") + bf.create_table(group, "create_worst", Create, "worst case") + for case in ["best", "worst"]: + # create a group for searching bench (best case) + groupS = bf.create_group(group, "search_" + case, "Search Group") + # Create Tables for searching + for mode in ["indexed", "inkernel", "standard"]: + groupM = bf.create_group(groupS, mode, mode + " Group") + # for searching bench + # for atom in ["string", "int", "float", "bool"]: + for atom in ["string", "int", "float"]: + bf.create_table(groupM, atom, Search, atom + " bench") + bf.close() + + +def createFile(filename, nrows, filters, index, heavy, 
noise, verbose): + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="Searchsorted Benchmark", + filters=filters) + rowswritten = 0 + + # Create the test table + table = fileh.create_table(fileh.root, 'table', Small, "test table", + None, nrows) + + t1 = clock() + cpu1 = cpuclock() + nrowsbuf = table.nrowsinbuf + minimum = 0 + maximum = nrows + for i in range(0, nrows, nrowsbuf): + if i + nrowsbuf > nrows: + j = nrows + else: + j = i + nrowsbuf + if randomvalues: + var3 = np.random.uniform(minimum, maximum, size=j - i) + else: + var3 = np.arange(i, j, dtype=np.float64) + if noise > 0: + var3 += np.random.uniform(-noise, noise, size=j - i) + var2 = np.array(var3, dtype=np.int32) + var1 = np.empty(shape=[j - i], dtype="S4") + if not heavy: + var1[:] = var2 + table.append([var3, var2, var1]) + table.flush() + rowswritten += nrows + time1 = clock() - t1 + tcpu1 = cpuclock() - cpu1 + print( + f"Time for filling: {time1:.3f} Krows/s: {nrows / 1000 / time1:.3f}", + end=' ') + fileh.close() + size1 = Path(filename).stat().st_size + print(f", File size: {size1 / 1024 / 1024:.3f} MB") + fileh = tb.open_file(filename, mode="a", title="Searchsorted Benchmark", + filters=filters) + table = fileh.root.table + rowsize = table.rowsize + if index: + t1 = clock() + cpu1 = cpuclock() + # Index all entries + if not heavy: + indexrows = table.cols.var1.create_index(filters=filters) + for colname in ['var2', 'var3']: + table.colinstances[colname].create_index(filters=filters) + time2 = clock() - t1 + tcpu2 = cpuclock() - cpu1 + print( + f"Time for indexing: {time2:.3f} " + f"iKrows/s: {indexrows / 1000 / time2:.3f}", + end=' ') + else: + indexrows = 0 + time2 = 0.000_000_000_1 # an ugly hack + tcpu2 = 0 + + if verbose: + if index: + idx = table.cols.var1.index + print("Index parameters:", repr(idx)) + else: + print("NOT indexing rows") + # Close the file + fileh.close() + + size2 = Path(filename).stat().st_size - size1 + if index: + print(f", Index size: {size2 / 1024 / 1024:.3f} MB") + return (rowswritten, indexrows, rowsize, time1, time2, + tcpu1, tcpu2, size1, size2) + + +def benchCreate(file, nrows, filters, index, bfile, heavy, + psyco, noise, verbose): + + # Open the benchfile in append mode + bf = tb.open_file(bfile, "a") + recsize = "small" + if worst: + table = bf.get_node("/" + recsize + "/create_worst") + else: + table = bf.get_node("/" + recsize + "/create_best") + + (rowsw, irows, rowsz, time1, time2, tcpu1, tcpu2, size1, size2) = \ + createFile(file, nrows, filters, index, heavy, noise, verbose) + # Collect data + table.row["nrows"] = rowsw + table.row["irows"] = irows + table.row["tfill"] = time1 + table.row["tidx"] = time2 + table.row["tcfill"] = tcpu1 + table.row["tcidx"] = tcpu2 + table.row["fsize"] = size1 + table.row["isize"] = size2 + table.row["psyco"] = psyco + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {time1} s (real) " + f"{tcpu1} s (cpu) {tcpu1 / time1:.0%}") + rowsecf = rowsw / time1 + table.row["rowsecf"] = rowsecf + print(f"Total file size: {(size1 + size2) / 1024 / 1024:.3f} MB", end=' ') + print(f", Write KB/s (pure data): {rowsw * rowsz / (time1 * 1024):.0f}") + print(f"Rows indexed: {irows} (IMRows): {irows / 10 ** 6}") + print( + f"Time indexing rows: {time2:.3f} s (real) " + f"{tcpu2:.3f} s (cpu) {tcpu2 / time2:.0%}") + rowseci = irows / time2 + table.row["rowseci"] = rowseci + table.row.append() + bf.close() + + +def readFile(filename, atom, riter, indexmode, dselect, verbose): + # Open the HDF5 file in 
read-only mode + + fileh = tb.open_file(filename, mode="r") + table = fileh.root.table + var1 = table.cols.var1 + var2 = table.cols.var2 + var3 = table.cols.var3 + if indexmode == "indexed": + if var2.index.nelements > 0: + where = table._whereIndexed + else: + warnings.warn( + "Not indexed table or empty index. Defaulting to in-kernel " + "selection") + indexmode = "inkernel" + where = table._whereInRange + elif indexmode == "inkernel": + where = table.where + if verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + if indexmode == "indexed": + print("Chunk size:", var2.index.sorted.chunksize) + print("Number of elements per slice:", var2.index.nelemslice) + print("Slice number in", table._v_pathname, ":", var2.index.nrows) + + #table.nrowsinbuf = 10 + # print "nrowsinbuf-->", table.nrowsinbuf + rowselected = 0 + time2 = 0 + tcpu2 = 0 + results = [] + print("Select mode:", indexmode, ". Selecting for type:", atom) + # Initialize the random generator always with the same integer + # in order to have reproductible results on each read iteration + random.seed(19) + np.random.seed(19) + for i in range(riter): + # The interval for look values at. This is aproximately equivalent to + # the number of elements to select + rnd = np.random.randint(table.nrows) + cpu1 = cpuclock() + t1 = clock() + if atom == "string": + val = str(rnd)[-4:] + if indexmode in ["indexed", "inkernel"]: + results = [p.nrow + for p in where('var1 == val')] + else: + results = [p.nrow for p in table + if p["var1"] == val] + elif atom == "int": + val = rnd + dselect + if indexmode in ["indexed", "inkernel"]: + results = [p.nrow + for p in where('(rnd <= var3) & (var3 < val)')] + else: + results = [p.nrow for p in table + if rnd <= p["var2"] < val] + elif atom == "float": + val = rnd + dselect + if indexmode in ["indexed", "inkernel"]: + t1 = clock() + results = [p.nrow + for p in where('(rnd <= var3) & (var3 < val)')] + else: + results = [p.nrow for p in table + if float(rnd) <= p["var3"] < float(val)] + else: + raise ValueError("Value for atom '%s' not supported." 
% atom) + rowselected += len(results) + # print "selected values-->", results + if i == 0: + # First iteration + time1 = clock() - t1 + tcpu1 = cpuclock() - cpu1 + else: + if indexmode == "indexed": + # if indexed, wait until the 5th iteration (in order to + # insure that the index is effectively cached) to take times + if i >= 5: + time2 += clock() - t1 + tcpu2 += cpuclock() - cpu1 + else: + time2 += clock() - t1 + tcpu2 += cpuclock() - cpu1 + + if riter > 1: + if indexmode == "indexed" and riter >= 5: + correction = 5 + else: + correction = 1 + time2 = time2 / (riter - correction) + tcpu2 = tcpu2 / (riter - correction) + if verbose and 1: + print("Values that fullfill the conditions:") + print(results) + + #rowsread = table.nrows * riter + rowsread = table.nrows + rowsize = table.rowsize + + # Close the file + fileh.close() + + return (rowsread, rowselected, rowsize, time1, time2, tcpu1, tcpu2) + + +def benchSearch(file, riter, indexmode, bfile, heavy, psyco, dselect, verbose): + + # Open the benchfile in append mode + bf = tb.open_file(bfile, "a") + recsize = "small" + if worst: + tableparent = "/" + recsize + "/search_worst/" + indexmode + "/" + else: + tableparent = "/" + recsize + "/search_best/" + indexmode + "/" + + # Do the benchmarks + if not heavy: + #atomlist = ["string", "int", "float", "bool"] + atomlist = ["string", "int", "float"] + else: + #atomlist = ["int", "float", "bool"] + atomlist = ["int", "float"] + for atom in atomlist: + tablepath = tableparent + atom + table = bf.get_node(tablepath) + (rowsr, rowsel, rowssz, time1, time2, tcpu1, tcpu2) = \ + readFile(file, atom, riter, indexmode, dselect, verbose) + row = table.row + row["nrows"] = rowsr + row["rowsel"] = rowsel + treadrows = time1 + row["time1"] = time1 + treadrows2 = time2 + row["time2"] = time2 + cpureadrows = tcpu1 + row["tcpu1"] = tcpu1 + cpureadrows2 = tcpu2 + row["tcpu2"] = tcpu2 + row["psyco"] = psyco + tratio = cpureadrows / treadrows + tratio2 = cpureadrows2 / treadrows2 if riter > 1 else 0. + tMrows = rowsr / (1000 * 1000.) + sKrows = rowsel / 1000. 
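+        # time1/tcpu1 come from the first (cold) query; time2/tcpu2 are the
+        # per-iteration averages computed in readFile() (in indexed mode the
+        # first five iterations are excluded so the index is effectively
+        # cached before timing).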
+ if atom == "string": # just to print once + print(f"Rows read: {rowsr} Mread: {tMrows:.6f} Mrows") + print(f"Rows selected: {rowsel} Ksel: {sKrows:.6f} Krows") + print( + f"Time selecting (1st time): {treadrows:.6f} s " + f"(real) {cpureadrows:.6f} s (cpu) {tratio:.0%}") + if riter > 1: + print( + f"Time selecting (cached): {treadrows2:.6f} s " + f"(real) {cpureadrows2:.6f} s (cpu) {tratio2:.0%}") + rowsec1 = rowsr / treadrows + row["rowsec1"] = rowsec1 + print(f"Read Mrows/sec: {rowsec1 / 10 ** 6:.6f} (first time)", end=' ') + if riter > 1: + rowsec2 = rowsr / treadrows2 + row["rowsec2"] = rowsec2 + print(f"{rowsec2 / 10 ** 6:.6f} (cache time)") + else: + print() + # Append the info to the table + row.append() + table.flush() + # Close the benchmark file + bf.close() + + +if __name__ == "__main__": + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-R] [-r] [-w] [-c level] [-l complib] [-S] [-F] [-n nrows] [-x] [-b file] [-t] [-h] [-k riter] [-m indexmode] [-N range] [-d range] datafile + -v verbose + -p use "psyco" if available + -R use Random values for filling + -r only read test + -w only write test + -c sets a compression level (do not set it or 0 for no compression) + -l sets the compression library ("zlib", "lzo", "ucl", "bzip2" or "none") + -S activate shuffling filter + -F activate fletcher32 filter + -n set the number of rows in tables (in krows) + -x don't make indexes + -b bench filename + -t worsT searching case + -h heavy benchmark (operations without strings) + -m index mode for reading ("indexed" | "inkernel" | "standard") + -N introduce (uniform) noise within range into the values + -d the interval for look values (int, float) at. Default is 3. + -k number of iterations for reading\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt( + sys.argv[1:], 'vpSFRrowxthk:b:c:l:n:m:N:d:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + dselect = 3 + noise = 0 + verbose = 0 + fieldName = None + testread = 1 + testwrite = 1 + usepsyco = 0 + complevel = 0 + shuffle = 0 + fletcher32 = 0 + complib = "zlib" + nrows = 1000 + index = 1 + heavy = 0 + bfile = "bench.h5" + supported_imodes = ["indexed", "inkernel", "standard"] + indexmode = "inkernel" + riter = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-R': + randomvalues = 1 + if option[0] == '-S': + shuffle = 1 + if option[0] == '-F': + fletcher32 = 1 + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-x': + index = 0 + elif option[0] == '-h': + heavy = 1 + elif option[0] == '-t': + worst = 1 + elif option[0] == '-b': + bfile = option[1] + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-m': + indexmode = option[1] + if indexmode not in supported_imodes: + raise ValueError( + "Indexmode should be any of '%s' and you passed '%s'" % + (supported_imodes, indexmode)) + elif option[0] == '-n': + nrows = int(float(option[1]) * 1000) + elif option[0] == '-N': + noise = float(option[1]) + elif option[0] == '-d': + dselect = float(option[1]) + elif option[0] == '-k': + riter = int(option[1]) + + if worst: + nrows -= 1 # the worst case + + if complib == "none": + # This means no compression at all + complib = "zlib" 
# just to make PyTables not complaining + complevel = 0 + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + # Build the Filters instance + filters = tb.Filters(complevel=complevel, complib=complib, + shuffle=shuffle, fletcher32=fletcher32) + + # Create the benchfile (if needed) + if not Path(bfile).exists(): + createNewBenchFile(bfile, verbose) + + if testwrite: + if verbose: + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if shuffle: + print("Suffling...") + if psyco_imported and usepsyco: + psyco.bind(createFile) + benchCreate(file, nrows, filters, index, bfile, heavy, + usepsyco, noise, verbose) + if testread: + if psyco_imported and usepsyco: + psyco.bind(readFile) + benchSearch(file, riter, indexmode, bfile, heavy, usepsyco, + dselect, verbose) diff --git a/bench/search-bench.sh b/bench/search-bench.sh new file mode 100755 index 0000000..402bd77 --- /dev/null +++ b/bench/search-bench.sh @@ -0,0 +1,123 @@ +#!/bin/sh +python="python2.5 -O" + +writedata () { + nrows=$1 + bfile=$2 + heavy=$3 + psyco=$4 + if [ "$shuffle" = "1" ]; then + shufflef="-S" + else + shufflef="" + fi + cmd="${python} search-bench.py -b ${bfile} ${heavy} ${psyco} -l ${libcomp} -c ${complevel} ${shufflef} -w -n ${nrows} -x data.nobackup/bench-${libcomp}-${nrows}k.h5" + echo ${cmd} + ${cmd} +} + +readdata () { + nrows=$1 + bfile=$2 + heavy=$3 + psyco=$4 + smode=$5 + + if [ "$smode" = "indexed" ]; then + repeats=100 + else + repeats=2 + fi + if [ "$heavy" = "-h" -a "$smode" = "standard" ]; then + # For heavy mode don't do a standard search + echo "Skipping the standard search for heavy mode" + else + cmd="${python} search-bench.py -b ${bfile} ${heavy} ${psyco} -m ${smode} -r -k ${repeats} data.nobackup/bench-${libcomp}-${nrows}k.h5" + echo ${cmd} + ${cmd} + fi + if [ "$smode" = "standard" -a "1" = "0" ]; then + # Finally, after the final search, delete the source (if desired) + rm -f data.nobackup/bench-${libcomp}-${nrows}k.h5 + fi + return +} + +overwrite=0 +if [ $# > 1 ]; then + if [ "$1" = "-o" ]; then + overwrite=1 + fi +fi +if [ $# > 2 ]; then + psyco=$2 +fi +# The next can be regarded as parameters +libcomp="lzo" +complevel=1 +shuffle=1 + +# The name of the data bench file +bfile="dbench-cl-${libcomp}-c${complevel}-S${shuffle}.h5" + +# Move out a possible previous benchmark file +bn=`basename $bfile ".h5"` +mv -f ${bn}-bck2.h5 ${bn}-bck3.h5 +mv -f ${bn}-bck.h5 ${bn}-bck2.h5 +if [ "$overwrite" = "1" ]; then + echo "moving ${bn}.h5 to ${bn}-bck.h5" + mv -f ${bn}.h5 ${bn}-bck.h5 +else + echo "copying ${bn}.h5 to ${bn}-bck.h5" + cp -f ${bn}.h5 ${bn}-bck.h5 +fi + +# Configuration for testing +nrowslist="1 2" +nrowslistheavy="5 10" +# This config takes 10 minutes to complete (psyco, zlib) +#nrowslist="1 2 5 10 20 50 100 200 500 1000" +#nrowslistheavy="2000 5000 10000" +#nrowslist="" +#nrowslistheavy="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000 50000 100000" + +# Normal test +#nrowslist="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000" +#nrowslistheavy="20000 50000 100000 200000 500000 1000000" +# Big test +#nrowslist="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000" +#nrowslistheavy="20000 50000 100000 200000 500000 1000000 2000000 5000000" + +for heavy in "" -h; do + # Write data files (light mode) + if [ "$heavy" = "-h" ]; then + echo + echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "Entering heavy mode..." 
+ echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo + nrowslist=$nrowslistheavy + fi + # Write data file + for nrows in $nrowslist; do + echo "*************************************************************" + echo "Writing for nrows=$nrows Krows, psyco=$psyco, heavy='${heavy}'" + echo "*************************************************************" + writedata ${nrows} ${bfile} "${heavy}" "${psyco}" + done + # Read data files + #for smode in indexed inkernel standard; do + for smode in inkernel standard; do +# for smode in indexed; do + ${python} cacheout.py + for nrows in $nrowslist; do + echo "***********************************************************" + echo "Searching for nrows=$nrows Krows, $smode, psyco=$psyco, heavy='${heavy}'" + echo "***********************************************************" + readdata ${nrows} ${bfile} "${heavy}" "${psyco}" "${smode}" + done + done +done + +echo "New data available on: $bfile" +exit 0 diff --git a/bench/searchsorted-bench.py b/bench/searchsorted-bench.py new file mode 100644 index 0000000..d754e4d --- /dev/null +++ b/bench/searchsorted-bench.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python + +from time import perf_counter as clock +from time import process_time as cpuclock + +import tables as tb + + +class Small(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) + var2 = tb.Int32Col() + var3 = tb.Float64Col() + var4 = tb.BoolCol() + +# Define a user record to characterize some kind of particles + + +class Medium(tb.IsDescription): + var1 = tb.StringCol(itemsize=16) # 16-character String + #float1 = Float64Col(dflt=2.3) + #float2 = Float64Col(dflt=2.3) + # zADCcount = Int16Col() # signed short integer + var2 = tb.Int32Col() # signed short integer + var3 = tb.Float64Col() + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col(shape=2) # double (double-precision) + + +def createFile(filename, nrows, filters, atom, recsize, index, verbose): + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="Searchsorted Benchmark", + filters=filters) + title = "This is the IndexArray title" + # Create an IndexArray instance + rowswritten = 0 + # Create an entry + klass = {"small": Small, "medium": Medium} + table = fileh.create_table(fileh.root, 'table', klass[recsize], title, + None, nrows) + for i in range(nrows): + #table.row['var1'] = str(i) + #table.row['var2'] = random.randrange(nrows) + table.row['var2'] = i + table.row['var3'] = i + #table.row['var4'] = i % 2 + #table.row['var4'] = i > 2 + table.row.append() + rowswritten += nrows + table.flush() + rowsize = table.rowsize + indexrows = 0 + + # Index one entry: + if index: + if atom == "string": + indexrows = table.cols.var1.create_index() + elif atom == "bool": + indexrows = table.cols.var4.create_index() + elif atom == "int": + indexrows = table.cols.var2.create_index() + elif atom == "float": + indexrows = table.cols.var3.create_index() + else: + raise ValueError("Index type not supported yet") + if verbose: + print("Number of indexed rows:", indexrows) + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowswritten, rowsize) + + +def readFile(filename, atom, niter, verbose): + # Open the HDF5 file in read-only mode + + fileh = tb.open_file(filename, mode="r") + table = fileh.root.table + print("reading", table) + if atom == "string": + idxcol = table.cols.var1.index + elif atom == "bool": + idxcol = table.cols.var4.index 
+ elif atom == "int": + idxcol = table.cols.var2.index + else: + idxcol = table.cols.var3.index + if verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + print("Chunk size:", idxcol.sorted.chunksize) + print("Number of elements per slice:", idxcol.nelemslice) + print("Slice number in", table._v_pathname, ":", idxcol.nrows) + + rowselected = 0 + if atom == "string": + for i in range(niter): + #results = [table.row["var3"] for i in table.where(2+i<=table.cols.var2 < 10+i)] + #results = [table.row.nrow() for i in table.where(2<=table.cols.var2 < 10)] + results = [p["var1"] # p.nrow() + for p in table.where(table.cols.var1 == "1111")] +# for p in table.where("1000"<=table.cols.var1<="1010")] + rowselected += len(results) + elif atom == "bool": + for i in range(niter): + results = [p["var2"] # p.nrow() + for p in table.where(table.cols.var4 == 0)] + rowselected += len(results) + elif atom == "int": + for i in range(niter): + #results = [table.row["var3"] for i in table.where(2+i<=table.cols.var2 < 10+i)] + #results = [table.row.nrow() for i in table.where(2<=table.cols.var2 < 10)] + results = [p["var2"] # p.nrow() + # for p in table.where(110*i<=table.cols.var2<110*(i+1))] + # for p in table.where(1000-30", positions) + print("Total iterations in search:", niter) + + rowsread += table.nrows + uncomprBytes += idxcol.sorted.chunksize * niter * idxcol.sorted.itemsize + + results = table.read(coords=positions) + print("results length:", len(results)) + if verbose: + print("Values that fullfill the conditions:") + print(results) + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, uncomprBytes, niter) + + +if __name__ == "__main__": + import sys + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-R range] [-r] [-w] [-s recsize ] [-a + atom] [-c level] [-l complib] [-S] [-F] [-i item] [-n nrows] [-x] + [-k niter] file + -v verbose + -p use "psyco" if available + -R select a range in a field in the form "start,stop,step" + -r only read test + -w only write test + -s record size + -a use [float], [int], [bool] or [string] atom + -c sets a compression level (do not set it or 0 for no compression) + -S activate shuffling filter + -F activate fletcher32 filter + -l sets the compression library to be used ("zlib", "lzo", "ucl", "bzip2") + -i item to search + -n set the number of rows in tables + -x don't make indexes + -k number of iterations for reading\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpSFR:rwxk:s:a:c:l:i:n:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + rng = None + item = None + atom = "int" + fieldName = None + testread = 1 + testwrite = 1 + usepsyco = 0 + complevel = 0 + shuffle = 0 + fletcher32 = 0 + complib = "zlib" + nrows = 100 + recsize = "small" + index = 1 + niter = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-S': + shuffle = 1 + if option[0] == '-F': + fletcher32 = 1 + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == 
'-x': + index = 0 + elif option[0] == '-s': + recsize = option[1] + elif option[0] == '-a': + atom = option[1] + if atom not in ["float", "int", "bool", "string"]: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-i': + item = eval(option[1]) + elif option[0] == '-n': + nrows = int(option[1]) + elif option[0] == '-k': + niter = int(option[1]) + + # Build the Filters instance + filters = tb.Filters(complevel=complevel, complib=complib, + shuffle=shuffle, fletcher32=fletcher32) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + if testwrite: + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if shuffle: + print("Suffling...") + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(createFile) + (rowsw, rowsz) = createFile(file, nrows, filters, + atom, recsize, index, verbose) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written:", rowsw, " Row size:", rowsz) + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows:.0f}") + print(f"Write KB/s : {rowsw * rowsz / tapprows / 1024:.0f}") + + if testread: + if psyco_imported and usepsyco: + psyco.bind(readFile) + psyco.bind(searchFile) + t1 = clock() + cpu1 = cpuclock() + if rng or item: + (rowsr, uncomprB, niter) = searchFile(file, atom, verbose, item) + else: + for i in range(1): + (rowsr, rowsel, rowsz) = readFile(file, atom, niter, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + tMrows = rowsr / 1000 / 1000 + sKrows = rowsel / 1000 + print(f"Rows read: {rowsr} Mread: {tMrows:.3f} Mrows") + print(f"Rows selected: {rowsel} Ksel: {sKrows:.3f} Krows") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read Mrows/sec: {tMrows / treadrows:.3f}") + # print "Read KB/s :", int(rowsr * rowsz / (treadrows * 1024)) +# print "Uncompr MB :", int(uncomprB / (1024 * 1024)) +# print "Uncompr MB/s :", int(uncomprB / (treadrows * 1024 * 1024)) +# print "Total chunks uncompr :", int(niter) diff --git a/bench/searchsorted-bench2.py b/bench/searchsorted-bench2.py new file mode 100644 index 0000000..02a144b --- /dev/null +++ b/bench/searchsorted-bench2.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python + +from time import perf_counter as clock +from time import process_time as cpuclock + +import tables as tb + + +class Small(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) + var2 = tb.Int32Col() + var3 = tb.Float64Col() + var4 = tb.BoolCol() + +# Define a user record to characterize some kind of particles + + +class Medium(tb.IsDescription): + var1 = tb.StringCol(itemsize=16, dflt="") # 16-character String + #float1 = Float64Col(dflt=2.3) + #float2 = Float64Col(dflt=2.3) + # zADCcount = Int16Col() # signed short integer + var2 = tb.Int32Col() # signed short integer + var3 = tb.Float64Col() + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col(shape=2) # double (double-precision) + + +def createFile(filename, nrows, filters, atom, recsize, index, verbose): + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="Searchsorted Benchmark", + 
filters=filters) + title = "This is the IndexArray title" + # Create an IndexArray instance + rowswritten = 0 + # Create an entry + klass = {"small": Small, "medium": Medium} + table = fileh.create_table(fileh.root, 'table', klass[recsize], title, + None, nrows) + for i in range(nrows): + #table.row['var1'] = str(i) + #table.row['var2'] = random.randrange(nrows) + table.row['var2'] = i + table.row['var3'] = i + #table.row['var4'] = i % 2 + table.row['var4'] = i > 2 + table.row.append() + rowswritten += nrows + table.flush() + rowsize = table.rowsize + indexrows = 0 + + # Index one entry: + if index: + if atom == "string": + indexrows = table.cols.var1.create_index() + elif atom == "bool": + indexrows = table.cols.var4.create_index() + elif atom == "int": + indexrows = table.cols.var2.create_index() + elif atom == "float": + indexrows = table.cols.var3.create_index() + else: + raise ValueError("Index type not supported yet") + if verbose: + print("Number of indexed rows:", indexrows) + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowswritten, rowsize) + + +def readFile(filename, atom, niter, verbose): + # Open the HDF5 file in read-only mode + + fileh = tb.open_file(filename, mode="r") + table = fileh.root.table + print("reading", table) + if atom == "string": + idxcol = table.cols.var1.index + elif atom == "bool": + idxcol = table.cols.var4.index + elif atom == "int": + idxcol = table.cols.var2.index + else: + idxcol = table.cols.var3.index + if verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + print("Chunk size:", idxcol.sorted.chunksize) + print("Number of elements per slice:", idxcol.nelemslice) + print("Slice number in", table._v_pathname, ":", idxcol.nrows) + + rowselected = 0 + if atom == "string": + for i in range(niter): + #results = [table.row["var3"] for i in table(where=2+i<=table.cols.var2 < 10+i)] + #results = [table.row.nrow() for i in table(where=2<=table.cols.var2 < 10)] + results = [p["var1"] # p.nrow() + for p in table(where=table.cols.var1 == "1111")] +# for p in table(where="1000"<=table.cols.var1<="1010")] + rowselected += len(results) + elif atom == "bool": + for i in range(niter): + results = [p["var2"] # p.nrow() + for p in table(where=table.cols.var4 == 0)] + rowselected += len(results) + elif atom == "int": + for i in range(niter): + #results = [table.row["var3"] for i in table(where=2+i<=table.cols.var2 < 10+i)] + #results = [table.row.nrow() for i in table(where=2<=table.cols.var2 < 10)] + results = [p["var2"] # p.nrow() + # for p in table(where=110*i<=table.cols.var2<110*(i+1))] + # for p in table(where=1000-30", positions) + print("Total iterations in search:", niter) + + rowsread += table.nrows + uncomprBytes += idxcol.sorted.chunksize * niter * idxcol.sorted.itemsize + + results = table.read(coords=positions) + print("results length:", len(results)) + if verbose: + print("Values that fullfill the conditions:") + print(results) + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, uncomprBytes, niter) + + +if __name__ == "__main__": + import sys + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-R range] [-r] [-w] [-s recsize ] [-a + atom] [-c level] [-l complib] [-S] [-F] [-i item] [-n nrows] [-x] + [-k niter] file + -v verbose + -p use "psyco" 
if available + -R select a range in a field in the form "start,stop,step" + -r only read test + -w only write test + -s record size + -a use [float], [int], [bool] or [string] atom + -c sets a compression level (do not set it or 0 for no compression) + -S activate shuffling filter + -F activate fletcher32 filter + -l sets the compression library to be used ("zlib", "lzo", "ucl", "bzip2") + -i item to search + -n set the number of rows in tables + -x don't make indexes + -k number of iterations for reading\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpSFR:rwxk:s:a:c:l:i:n:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + rng = None + item = None + atom = "int" + fieldName = None + testread = 1 + testwrite = 1 + usepsyco = 0 + complevel = 0 + shuffle = 0 + fletcher32 = 0 + complib = "zlib" + nrows = 100 + recsize = "small" + index = 1 + niter = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-S': + shuffle = 1 + if option[0] == '-F': + fletcher32 = 1 + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-x': + index = 0 + elif option[0] == '-s': + recsize = option[1] + elif option[0] == '-a': + atom = option[1] + if atom not in ["float", "int", "bool", "string"]: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-i': + item = eval(option[1]) + elif option[0] == '-n': + nrows = int(option[1]) + elif option[0] == '-k': + niter = int(option[1]) + + # Build the Filters instance + filters = tb.Filters(complevel=complevel, complib=complib, + shuffle=shuffle, fletcher32=fletcher32) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + if testwrite: + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if shuffle: + print("Suffling...") + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(createFile) + (rowsw, rowsz) = createFile(file, nrows, filters, + atom, recsize, index, verbose) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows:.0f}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + + if testread: + if psyco_imported and usepsyco: + psyco.bind(readFile) + psyco.bind(searchFile) + t1 = clock() + cpu1 = cpuclock() + if rng or item: + (rowsr, uncomprB, niter) = searchFile(file, atom, verbose, item) + else: + for i in range(1): + (rowsr, rowsel, rowsz) = readFile(file, atom, niter, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + tMrows = rowsr / 1000 / 1000 + sKrows = rowsel / 1000 + print(f"Rows read: {rowsr} Mread: {tMrows:.3f} Mrows") + print(f"Rows selected: {rowsel} Ksel: {sKrows:.3f} Krows") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read Mrows/sec: {tMrows / treadrows:.3f}") + # print "Read KB/s :", 
int(rowsr * rowsz / (treadrows * 1024)) +# print "Uncompr MB :", int(uncomprB / (1024 * 1024)) +# print "Uncompr MB/s :", int(uncomprB / (treadrows * 1024 * 1024)) +# print "Total chunks uncompr :", int(niter) diff --git a/bench/shelve-bench.py b/bench/shelve-bench.py new file mode 100644 index 0000000..53a08a6 --- /dev/null +++ b/bench/shelve-bench.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python + +import struct +import sys +import shelve +import numpy as np +import tables as tb +import psyco + +# This class is accessible only for the examples + + +class Small(tb.IsDescription): + + """Record descriptor. + + A record has several columns. They are represented here as class + attributes, whose names are the column names and their values will + become their types. The IsDescription class will take care the user + will not add any new variables and that its type is correct. + + """ + + var1 = tb.StringCol(itemsize=4) + var2 = tb.Int32Col() + var3 = tb.Float64Col() + +# Define a user record to characterize some kind of particles + + +class Medium(tb.IsDescription): + name = tb.StringCol(itemsize=16) # 16-character String + float1 = tb.Float64Col(shape=2, dflt=2.3) + #float1 = Float64Col(dflt=1.3) + #float2 = Float64Col(dflt=2.3) + ADCcount = tb.Int16Col() # signed short integer + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col() # double (double-precision) + +# Define a user record to characterize some kind of particles + + +class Big(tb.IsDescription): + name = tb.StringCol(itemsize=16) # 16-character String + #float1 = Float64Col(shape=32, dflt=np.arange(32)) + #float2 = Float64Col(shape=32, dflt=np.arange(32)) + float1 = tb.Float64Col(shape=32, dflt=range(32)) + float2 = tb.Float64Col(shape=32, dflt=[2.2] * 32) + ADCcount = tb.Int16Col() # signed short integer + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col() # double (double-precision) + + +def createFile(filename, totalrows, recsize): + + # Open a 'n'ew file + fileh = shelve.open(filename, flag="n") + + rowswritten = 0 + # Get the record object associated with the new table + if recsize == "big": + d = Big() + arr = np.arange(32, dtype=np.float64) + arr2 = np.arange(32, dtype=np.float64) + elif recsize == "medium": + d = Medium() + else: + d = Small() + # print d + # sys.exit(0) + for j in range(3): + # Create a table + # table = fileh.create_table(group, 'tuple'+str(j), Record(), title, + # compress = 6, expectedrows = totalrows) + # Create a Table instance + tablename = 'tuple' + str(j) + table = [] + # Fill the table + if recsize == "big" or recsize == "medium": + for i in range(totalrows): + d.name = 'Particle: %6d' % (i) + #d.TDCcount = i % 256 + d.ADCcount = (i * 256) % (1 << 16) + if recsize == "big": + #d.float1 = np.array([i]*32, np.float64) + #d.float2 = np.array([i**2]*32, np.float64) + arr[0] = 1.1 + d.float1 = arr + arr2[0] = 2.2 + d.float2 = arr2 + pass + else: + d.float1 = np.array([i ** 2] * 2, np.float64) + #d.float1 = float(i) + #d.float2 = float(i) + d.grid_i = i + d.grid_j = 10 - i + d.pressure = float(i * i) + d.energy = float(d.pressure ** 4) + table.append((d.ADCcount, d.energy, d.float1, d.float2, + d.grid_i, d.grid_j, d.name, d.pressure)) + # Only on float case + # table.append((d.ADCcount, d.energy, d.float1, + # d.grid_i, d.grid_j, d.name, d.pressure)) + else: + for i in range(totalrows): + d.var1 = str(i) + d.var2 
= i + d.var3 = 12.1e10 + table.append((d.var1, d.var2, d.var3)) + + # Save this table on disk + fileh[tablename] = table + rowswritten += totalrows + + # Close the file + fileh.close() + return (rowswritten, struct.calcsize(d._v_fmt)) + + +def readFile(filename, recsize): + # Open the HDF5 file in read-only mode + fileh = shelve.open(filename, "r") + for table in ['tuple0', 'tuple1', 'tuple2']: + if recsize == "big" or recsize == "medium": + e = [t[2] for t in fileh[table] if t[4] < 20] + # if there is only one float (array) + #e = [ t[1] for t in fileh[table] if t[3] < 20 ] + else: + e = [t[1] for t in fileh[table] if t[1] < 20] + + print("resulting selection list ==>", e) + print("Total selected records ==> ", len(e)) + + # Close the file (eventually destroy the extended type) + fileh.close() + + +# Add code to test here +if __name__ == "__main__": + import getopt + from time import perf_counter as clock + + usage = """usage: %s [-f] [-s recsize] [-i iterations] file + -s use [big] record, [medium] or [small] + -i sets the number of rows in each table\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 's:fi:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + recsize = "medium" + iterations = 100 + + # Get the options + for option in opts: + if option[0] == '-s': + recsize = option[1] + if recsize not in ["big", "medium", "small"]: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-i': + iterations = int(option[1]) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + t1 = clock() + psyco.bind(createFile) + (rowsw, rowsz) = createFile(file, iterations, recsize) + t2 = clock() + tapprows = t2 - t1 + + t1 = clock() + psyco.bind(readFile) + readFile(file, recsize) + t2 = clock() + treadrows = t2 - t1 + + print(f"Rows written: {rowsw} Row size: {rowsz}") + print(f"Time appending rows: {tapprows:.3f}") + print(f"Write rows/sec: {iterations * 3 / tapprows:.0f}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + print(f"Time reading rows: {treadrows:.3f}") + print(f"Read rows/sec: {iterations * 3 / treadrows:.0f}") + print(f"Read KB/s : {rowsw * rowsz / (treadrows * 1024):.0f}") diff --git a/bench/split-file.py b/bench/split-file.py new file mode 100644 index 0000000..be25a9b --- /dev/null +++ b/bench/split-file.py @@ -0,0 +1,39 @@ +""" +Split out a monolithic file with many different runs of +indexed_search.py. The resulting files are meant for use in +get-figures.py. 
+ +Usage: python split-file.py prefix filename +""" + +import sys +from pathlib import Path + +prefix = sys.argv[1] +filename = sys.argv[2] +sf = None +for line in Path(filename).read_text().splitlines(): + if line.startswith('Processing database:'): + if sf: + sf.close() + line2 = line.split(':')[1] + # Check if entry is compressed and if has to be processed + line2 = line2[:line2.rfind('.')] + params = line2.split('-') + optlevel = 0 + complib = None + for param in params: + if param[0] == 'O' and param[1].isdigit(): + optlevel = int(param[1]) + elif param[:-1] in ('zlib', 'lzo'): + complib = param + if 'PyTables' in prefix: + if complib: + sfilename = f"{prefix}-O{optlevel}-{complib}.out" + else: + sfilename = f"{prefix}-O{optlevel}.out" + else: + sfilename = f"{prefix}.out" + sf = file(sfilename, 'a') + if sf: + sf.write(line) diff --git a/bench/sqlite-search-bench-rnd.sh b/bench/sqlite-search-bench-rnd.sh new file mode 100755 index 0000000..629d0e4 --- /dev/null +++ b/bench/sqlite-search-bench-rnd.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# I don't know why, but the /usr/bin/python2.3 from Debian is a 30% slower +# than my own compiled version! 2004-08-18 +python="/usr/local/bin/python2.3 -O" + +writedata () { + nrows=$1 + bfile=$2 + smode=$3 + psyco=$4 + cmd="${python} sqlite-search-bench.py -R -h -b ${bfile} ${psyco} -m ${smode} -w -n ${nrows} data.nobackup/sqlite-${nrows}k.h5" + echo ${cmd} + ${cmd} +} + +readdata () { + nrows=$1 + bfile=$2 + smode=$3 + psyco=$4 + + if [ "$smode" = "indexed" ]; then + #repeats=100 + repeats=20 + else + repeats=2 + fi + cmd="${python} sqlite-search-bench.py -R -h -b ${bfile} ${psyco} -n ${nrows} -m ${smode} -r -k ${repeats} data.nobackup/sqlite-${nrows}k.h5" + echo ${cmd} + ${cmd} + # Finally, delete the source (if desired) + if [ "$smode" = "indexed" ]; then + echo "Deleting data file data.nobackup/sqlite-${nrows}k.h5" +# rm -f data.nobackup/sqlite-${nrows}k.h5 + fi + return +} + +overwrite=0 +if [ $# > 1 ]; then + if [ "$1" = "-o" ]; then + overwrite=1 + fi +fi +if [ $# > 2 ]; then + psyco=$2 +fi + +# The name of the data bench file +bfile="sqlite-dbench.h5" + +# Move out a possible previous benchmark file +bn=`basename $bfile ".h5"` +mv -f ${bn}-bck2.h5 ${bn}-bck3.h5 +mv -f ${bn}-bck.h5 ${bn}-bck2.h5 +if [ "$overwrite" = "1" ]; then + echo "moving ${bn}.h5 to ${bn}-bck.h5" + mv -f ${bn}.h5 ${bn}-bck.h5 +else + echo "copying ${bn}.h5 to ${bn}-bck.h5" + cp -f ${bn}.h5 ${bn}-bck.h5 +fi + +# Configuration for testing +nrowsliststd="1 2" +nrowslistidx="1 2" +#nrowsliststd="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000 50000" +#nrowsliststd="1 2 5 10 20" +#nrowslistidx="1 2 5 10 20" +# nrowsliststd="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000" +# nrowslistidx="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000" +#nrowsliststd="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000 50000 100000" +#nrowslistidx="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000 50000 100000" + +for smode in standard indexed; do +#for smode in indexed; do + echo + echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "Entering ${smode} mode..." 
+ echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo + if [ "$smode" = "standard" ]; then + nrowslist=$nrowsliststd + else + nrowslist=$nrowslistidx + fi + # Write data files + for nrows in $nrowslist; do + echo "*************************************************************" + echo "Writing for nrows=$nrows Krows, $smode, psyco=$psyco" + echo "*************************************************************" + writedata ${nrows} ${bfile} "${smode}" "${psyco}" + done + # Read data files + ${python} cacheout.py + for nrows in $nrowslist; do + echo "***********************************************************" + echo "Searching for nrows=$nrows Krows, $smode, psyco=$psyco" + echo "***********************************************************" + readdata ${nrows} ${bfile} "${smode}" "${psyco}" + done +done + +echo "New data available on: $bfile" +exit 0 diff --git a/bench/sqlite-search-bench.py b/bench/sqlite-search-bench.py new file mode 100644 index 0000000..6c37705 --- /dev/null +++ b/bench/sqlite-search-bench.py @@ -0,0 +1,454 @@ +#!/usr/bin/python + +import os +import random +import sqlite3 +import sys +from pathlib import Path +from time import perf_counter as clock +from time import process_time as cpuclock + +import numpy as np +import tables as tb + +randomvalues = 0 +standarddeviation = 10_000 +# Initialize the random generator always with the same integer +# in order to have reproductible results +random.seed(19) +np.random.seed((19, 20)) + +# defaults +psycon = 0 +worst = 0 + + +def createNewBenchFile(bfile, verbose): + + class Create(tb.IsDescription): + nrows = tb.Int32Col(pos=0) + irows = tb.Int32Col(pos=1) + tfill = tb.Float64Col(pos=2) + tidx = tb.Float64Col(pos=3) + tcfill = tb.Float64Col(pos=4) + tcidx = tb.Float64Col(pos=5) + rowsecf = tb.Float64Col(pos=6) + rowseci = tb.Float64Col(pos=7) + fsize = tb.Float64Col(pos=8) + isize = tb.Float64Col(pos=9) + psyco = tb.BoolCol(pos=10) + + class Search(tb.IsDescription): + nrows = tb.Int32Col(pos=0) + rowsel = tb.Int32Col(pos=1) + time1 = tb.Float64Col(pos=2) + time2 = tb.Float64Col(pos=3) + tcpu1 = tb.Float64Col(pos=4) + tcpu2 = tb.Float64Col(pos=5) + rowsec1 = tb.Float64Col(pos=6) + rowsec2 = tb.Float64Col(pos=7) + psyco = tb.BoolCol(pos=8) + + if verbose: + print("Creating a new benchfile:", bfile) + # Open the benchmarking file + bf = tb.open_file(bfile, "w") + # Create groups + for recsize in ["sqlite_small"]: + group = bf.create_group("/", recsize, recsize + " Group") + # Attach the row size of table as attribute + if recsize == "small": + group._v_attrs.rowsize = 16 + # Create a Table for writing bench + bf.create_table(group, "create_indexed", Create, "indexed values") + bf.create_table(group, "create_standard", Create, "standard values") + # create a group for searching bench + groupS = bf.create_group(group, "search", "Search Group") + # Create Tables for searching + for mode in ["indexed", "standard"]: + group = bf.create_group(groupS, mode, mode + " Group") + # for searching bench + # for atom in ["string", "int", "float", "bool"]: + for atom in ["string", "int", "float"]: + bf.create_table(group, atom, Search, atom + " bench") + bf.close() + + +def createFile(filename, nrows, filters, indexmode, heavy, noise, bfile, + verbose): + + # Initialize some variables + t1 = 0 + t2 = 0 + tcpu1 = 0 + tcpu2 = 0 + rowsecf = 0 + rowseci = 0 + size1 = 0 + size2 = 0 + + if indexmode == "standard": + print("Creating a new database:", dbfile) + instd = os.popen("/usr/local/bin/sqlite " + dbfile, "w") + CREATESTD = 
""" +CREATE TABLE small ( +-- Name Type -- Example +--------------------------------------- +recnum INTEGER PRIMARY KEY, -- 345 +var1 char(4), -- Abronia villosa +var2 INTEGER, -- 111 +var3 FLOAT -- 12.32 +); +""" + CREATEIDX = """ +CREATE TABLE small ( +-- Name Type -- Example +--------------------------------------- +recnum INTEGER PRIMARY KEY, -- 345 +var1 char(4), -- Abronia villosa +var2 INTEGER, -- 111 +var3 FLOAT -- 12.32 +); +CREATE INDEX ivar1 ON small(var1); +CREATE INDEX ivar2 ON small(var2); +CREATE INDEX ivar3 ON small(var3); +""" + # Creating the table first and indexing afterwards is a bit faster + instd.write(CREATESTD) + instd.close() + + conn = sqlite3.connect(dbfile) + cursor = conn.cursor() + if indexmode == "standard": + place_holders = ",".join(['%s'] * 3) + # Insert rows + SQL = "insert into small values(NULL, %s)" % place_holders + time1 = clock() + cpu1 = cpuclock() + # This way of filling is to copy the PyTables benchmark + nrowsbuf = 1000 + minimum = 0 + maximum = nrows + for i in range(0, nrows, nrowsbuf): + if i + nrowsbuf > nrows: + j = nrows + else: + j = i + nrowsbuf + if randomvalues: + var3 = np.random.uniform(minimum, maximum, shape=[j - i]) + else: + var3 = np.arange(i, j, type=np.float64) + if noise: + var3 += np.random.uniform(-3, 3, shape=[j - i]) + var2 = np.array(var3, type=np.int32) + var1 = np.array(None, shape=[j - i], dtype='s4') + if not heavy: + for n in range(j - i): + var1[n] = str("%.4s" % var2[n]) + for n in range(j - i): + fields = (var1[n], var2[n], var3[n]) + cursor.execute(SQL, fields) + conn.commit() + t1 = clock() - time1 + tcpu1 = cpuclock() - cpu1 + rowsecf = nrows / t1 + size1 = Path(dbfile).stat().st_size + print(f"******** Results for writing nrows = {nrows} *********") + print(f"Insert time: {t1:.5f}, KRows/s: {nrows / 1000 / t1:.3f}") + print(f", File size: {size1 / 1024 / 1024:.3f} MB") + + # Indexem + if indexmode == "indexed": + time1 = clock() + cpu1 = cpuclock() + if not heavy: + cursor.execute("CREATE INDEX ivar1 ON small(var1)") + conn.commit() + cursor.execute("CREATE INDEX ivar2 ON small(var2)") + conn.commit() + cursor.execute("CREATE INDEX ivar3 ON small(var3)") + conn.commit() + t2 = clock() - time1 + tcpu2 = cpuclock() - cpu1 + rowseci = nrows / t2 + print(f"Index time: {t2:.5f}, IKRows/s: {nrows / 1000 / t2:.3f}") + size2 = Path(dbfile).stat().st_size - size1 + print(f", Final size with index: {size2 / 1024 / 1024:.3f} MB") + + conn.close() + + # Collect benchmark data + bf = tb.open_file(bfile, "a") + recsize = "sqlite_small" + if indexmode == "indexed": + table = bf.get_node("/" + recsize + "/create_indexed") + else: + table = bf.get_node("/" + recsize + "/create_standard") + table.row["nrows"] = nrows + table.row["irows"] = nrows + table.row["tfill"] = t1 + table.row["tidx"] = t2 + table.row["tcfill"] = tcpu1 + table.row["tcidx"] = tcpu2 + table.row["psyco"] = psycon + table.row["rowsecf"] = rowsecf + table.row["rowseci"] = rowseci + table.row["fsize"] = size1 + table.row["isize"] = size2 + table.row.append() + bf.close() + + return + + +def readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter): + # Connect to the database. 
+ conn = sqlite3.connect(db=dbfile, mode=755) + # Obtain a cursor + cursor = conn.cursor() + + # select count(*), avg(var2) + SQL1 = """ + select recnum + from small where var1 = %s + """ + SQL2 = """ + select recnum + from small where var2 >= %s and var2 < %s + """ + SQL3 = """ + select recnum + from small where var3 >= %s and var3 < %s + """ + + # Open the benchmark database + bf = tb.open_file(bfile, "a") + # default values for the case that columns are not indexed + t2 = 0 + tcpu2 = 0 + # Some previous computations for the case of random values + if randomvalues: + # algorithm to choose a value separated from mean +# If want to select fewer values, select this +# if nrows/2 > standarddeviation*3: +# Choose five standard deviations away from mean value +# dev = standarddeviation*5 +# dev = standarddeviation*math.log10(nrows/1000.) + + # This algorithm give place to too asymmetric result values +# if standarddeviation*10 < nrows/2: +# Choose four standard deviations away from mean value +# dev = standarddeviation*4 +# else: +# dev = 100 + # Yet Another Algorithm + if nrows / 2 > standarddeviation * 10: + dev = standarddeviation * 4 + elif nrows / 2 > standarddeviation: + dev = standarddeviation * 2 + elif nrows / 2 > standarddeviation / 10: + dev = standarddeviation / 10 + else: + dev = standarddeviation / 100 + + valmax = round(nrows / 2 - dev) + # split the selection range in regular chunks + if riter > valmax * 2: + riter = valmax * 2 + chunksize = (valmax * 2 / riter) * 10 + # Get a list of integers for the intervals + randlist = range(0, valmax, chunksize) + randlist.extend(range(nrows - valmax, nrows, chunksize)) + # expand the list ten times so as to use the cache + randlist = randlist * 10 + # shuffle the list + random.shuffle(randlist) + # reset the value of chunksize + chunksize = chunksize / 10 + # print "chunksize-->", chunksize + # randlist.sort();print "randlist-->", randlist + else: + chunksize = 3 + if heavy: + searchmodelist = ["int", "float"] + else: + searchmodelist = ["string", "int", "float"] + + # Execute queries + for atom in searchmodelist: + time2 = 0 + cpu2 = 0 + rowsel = 0 + for i in range(riter): + rnd = random.randrange(nrows) + time1 = clock() + cpu1 = cpuclock() + if atom == "string": + #cursor.execute(SQL1, "1111") + cursor.execute(SQL1, str(rnd)[-4:]) + elif atom == "int": + #cursor.execute(SQL2 % (rnd, rnd+3)) + cursor.execute(SQL2 % (rnd, rnd + dselect)) + elif atom == "float": + #cursor.execute(SQL3 % (float(rnd), float(rnd+3))) + cursor.execute(SQL3 % (float(rnd), float(rnd + dselect))) + else: + raise ValueError( + "atom must take a value in ['string','int','float']") + if i == 0: + t1 = clock() - time1 + tcpu1 = cpuclock() - cpu1 + else: + if indexmode == "indexed": + # if indexed, wait until the 5th iteration to take + # times (so as to insure that the index is + # effectively cached) + if i >= 5: + time2 += clock() - time1 + cpu2 += cpuclock() - cpu1 + else: + time2 += clock() - time1 + time2 += cpuclock() - cpu1 + if riter > 1: + if indexmode == "indexed" and riter >= 5: + correction = 5 + else: + correction = 1 + t2 = time2 / (riter - correction) + tcpu2 = cpu2 / (riter - correction) + + print( + f"*** Query results for atom = {atom}, " + f"nrows = {nrows}, indexmode = {indexmode} ***") + print(f"Query time: {t1:.5f}, cached time: {t2:.5f}") + print(f"MRows/s: {nrows / 1_000_000 / t1:.3f}", end=' ') + if t2 > 0: + print(f", cached MRows/s: {nrows / 10 ** 6 / t2:.3f}") + else: + print() + + # Collect benchmark data + recsize = "sqlite_small" + 
tablepath = "/" + recsize + "/search/" + indexmode + "/" + atom + table = bf.get_node(tablepath) + table.row["nrows"] = nrows + table.row["rowsel"] = rowsel + table.row["time1"] = t1 + table.row["time2"] = t2 + table.row["tcpu1"] = tcpu1 + table.row["tcpu2"] = tcpu2 + table.row["psyco"] = psycon + table.row["rowsec1"] = nrows / t1 + if t2 > 0: + table.row["rowsec2"] = nrows / t2 + table.row.append() + table.flush() # Flush the data + + # Close the database + conn.close() + bf.close() # the bench database + + return + +if __name__ == "__main__": + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-R] [-h] [-t] [-r] [-w] [-n nrows] [-b file] [-k riter] [-m indexmode] [-N range] datafile + -v verbose + -p use "psyco" if available + -R use Random values for filling + -h heavy mode (exclude strings from timings) + -t worsT searching case (to emulate PyTables worst cases) + -r only read test + -w only write test + -n the number of rows (in krows) + -b bench filename + -N introduce (uniform) noise within range into the values + -d the interval for look values (int, float) at. Default is 3. + -k number of iterations for reading\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpRhtrwn:b:k:m:N:d:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + dselect = 3 + noise = 0 + verbose = 0 + heavy = 0 + testread = 1 + testwrite = 1 + usepsyco = 0 + nrows = 1000 + bfile = "sqlite-bench.h5" + supported_imodes = ["indexed", "standard"] + indexmode = "indexed" + riter = 2 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + if option[0] == '-p': + usepsyco = 1 + elif option[0] == '-R': + randomvalues = 1 + elif option[0] == '-h': + heavy = 1 + elif option[0] == '-t': + worst = 1 + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-b': + bfile = option[1] + elif option[0] == '-N': + noise = float(option[1]) + elif option[0] == '-m': + indexmode = option[1] + if indexmode not in supported_imodes: + raise ValueError( + "Indexmode should be any of '%s' and you passed '%s'" % + (supported_imodes, indexmode)) + elif option[0] == '-n': + nrows = int(float(option[1]) * 1000) + elif option[0] == '-d': + dselect = float(option[1]) + elif option[0] == '-k': + riter = int(option[1]) + + # remaining parameters + dbfile = pargs[0] + + if worst: + nrows -= 1 # the worst case + + # Create the benchfile (if needed) + if not Path(bfile).exists(): + createNewBenchFile(bfile, verbose) + + if testwrite: + if psyco_imported and usepsyco: + psyco.bind(createFile) + psycon = 1 + createFile(dbfile, nrows, None, indexmode, heavy, noise, bfile, + verbose) + + if testread: + if psyco_imported and usepsyco: + psyco.bind(readFile) + psycon = 1 + readFile(dbfile, nrows, indexmode, heavy, dselect, bfile, riter) diff --git a/bench/sqlite-search-bench.sh b/bench/sqlite-search-bench.sh new file mode 100755 index 0000000..4adca89 --- /dev/null +++ b/bench/sqlite-search-bench.sh @@ -0,0 +1,96 @@ +#!/bin/sh +# I don't know why, but the /usr/bin/python2.3 from Debian is a 30% slower +# than my own compiled version! 
2004-08-18 +python="/usr/local/bin/python2.3 -O" + +writedata () { + nrows=$1 + bfile=$2 + smode=$3 + psyco=$4 + cmd="${python} sqlite-search-bench.py -b ${bfile} ${psyco} -m ${smode} -w -n ${nrows} data.nobackup/sqlite-${nrows}k-${smode}.h5" + echo ${cmd} + ${cmd} +} + +readdata () { + nrows=$1 + bfile=$2 + smode=$3 + psyco=$4 + + if [ "$smode" = "indexed" ]; then + repeats=100 + else + repeats=2 + fi + cmd="${python} sqlite-search-bench.py -b ${bfile} ${psyco} -n ${nrows} -m ${smode} -r -k ${repeats} data.nobackup/sqlite-${nrows}k-${smode}.h5" + echo ${cmd} + ${cmd} + # Finally, delete the source (if desired) + #rm -f data.nobackup/sqlite-${nrows}k-${smode}.h5 + return +} + +overwrite=0 +if [ $# > 1 ]; then + if [ "$1" = "-o" ]; then + overwrite=1 + fi +fi +if [ $# > 2 ]; then + psyco=$2 +fi + +# The name of the data bench file +bfile="sqlite-dbench.h5" + +# Move out a possible previous benchmark file +bn=`basename $bfile ".h5"` +mv -f ${bn}-bck2.h5 ${bn}-bck3.h5 +mv -f ${bn}-bck.h5 ${bn}-bck2.h5 +if [ "$overwrite" = "1" ]; then + echo "moving ${bn}.h5 to ${bn}-bck.h5" + mv -f ${bn}.h5 ${bn}-bck.h5 +else + echo "copying ${bn}.h5 to ${bn}-bck.h5" + cp -f ${bn}.h5 ${bn}-bck.h5 +fi + +# Configuration for testing +nrowsliststd="1 2 5 10 20 50" +#nrowslistidx="1 2 5 10 20 50" +#nrowsliststd="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000 20000 50000" +nrowslistidx="1 2 5 10 20 50 100 200 500 1000 2000 5000 10000" + +#for smode in standard indexed; do +for smode in indexed; do + echo + echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo "Entering ${smode} mode..." + echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" + echo + if [ "$smode" = "standard" ]; then + nrowslist=$nrowsliststd + else + nrowslist=$nrowslistidx + fi + # Write data files +# for nrows in $nrowslist; do +# echo "*************************************************************" +# echo "Writing for nrows=$nrows Krows, $smode, psyco=$psyco" +# echo "*************************************************************" +# writedata ${nrows} ${bfile} "${smode}" "${psyco}" +# done + # Read data files + ${python} cacheout.py + for nrows in $nrowslist; do + echo "***********************************************************" + echo "Searching for nrows=$nrows Krows, $smode, psyco=$psyco" + echo "***********************************************************" + readdata ${nrows} ${bfile} "${smode}" "${psyco}" + done +done + +echo "New data available on: $bfile" +exit 0 diff --git a/bench/sqlite3-search-bench.py b/bench/sqlite3-search-bench.py new file mode 100644 index 0000000..8c6735a --- /dev/null +++ b/bench/sqlite3-search-bench.py @@ -0,0 +1,188 @@ +from pathlib import Path +from time import perf_counter as clock +import numpy as np +import random + +# in order to always generate the same random sequence +random.seed(19) + + +def fill_arrays(start, stop): + col_i = np.arange(start, stop, dtype=np.int32) + if userandom: + col_j = np.random.uniform(0, nrows, stop - start) + else: + col_j = np.array(col_i, dtype=np.float64) + return col_i, col_j + +# Generator for ensure pytables benchmark compatibility + + +def int_generator(nrows): + step = 1000 * 100 + j = 0 + for i in range(nrows): + if i >= step * j: + stop = (j + 1) * step + if stop > nrows: # Seems unnecessary + stop = nrows + col_i, col_j = fill_arrays(i, stop) + j += 1 + k = 0 + yield (col_i[k], col_j[k]) + k += 1 + + +def int_generator_slow(nrows): + for i in range(nrows): + if userandom: + yield (i, float(random.randint(0, nrows))) + else: 
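+            # deterministic case: the j column simply mirrors the row number i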
+ yield (i, float(i)) + + +def open_db(filename, remove=0): + if remove and Path(filename).is_file(): + Path(filename).unlink() + con = sqlite.connect(filename) + cur = con.cursor() + return con, cur + + +def create_db(filename, nrows): + con, cur = open_db(filename, remove=1) + cur.execute("create table ints(i integer, j real)") + t1 = clock() + # This is twice as fast as a plain loop + cur.executemany("insert into ints(i,j) values (?,?)", int_generator(nrows)) + con.commit() + ctime = clock() - t1 + if verbose: + print(f"insert time: {ctime:.5f}") + print(f"Krows/s: {nrows / 1000 / ctime:.5f}") + close_db(con, cur) + + +def index_db(filename): + con, cur = open_db(filename) + t1 = clock() + cur.execute("create index ij on ints(j)") + con.commit() + itime = clock() - t1 + if verbose: + print(f"index time: {itime:.5f}") + print(f"Krows/s: {nrows / itime:.5f}") + # Close the DB + close_db(con, cur) + + +def query_db(filename, rng): + con, cur = open_db(filename) + t1 = clock() + ntimes = 10 + for i in range(ntimes): + # between clause does not seem to take advantage of indexes + # cur.execute("select j from ints where j between %s and %s" % \ + cur.execute("select i from ints where j >= %s and j <= %s" % + # cur.execute("select i from ints where i >= %s and i <= + # %s" % + (rng[0] + i, rng[1] + i)) + results = cur.fetchall() + con.commit() + qtime = (clock() - t1) / ntimes + if verbose: + print(f"query time: {qtime:.5f}") + print(f"Mrows/s: {nrows / 1000 / qtime:.5f}") + print(results) + close_db(con, cur) + + +def close_db(con, cur): + cur.close() + con.close() + +if __name__ == "__main__": + import sys + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-v] [-p] [-m] [-i] [-q] [-c] [-R range] [-n nrows] file + -v verbose + -p use "psyco" if available + -m use random values to fill the table + -q do query + -c create the database + -i index the table + -2 use sqlite2 (default is use sqlite3) + -R select a range in a field in the form "start,stop" (def "0,10") + -n sets the number of rows (in krows) in each table + \n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpmiqc2R:n:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + usepsyco = 0 + userandom = 0 + docreate = 0 + createindex = 0 + doquery = 0 + sqlite_version = "3" + rng = [0, 10] + nrows = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-p': + usepsyco = 1 + elif option[0] == '-m': + userandom = 1 + elif option[0] == '-i': + createindex = 1 + elif option[0] == '-q': + doquery = 1 + elif option[0] == '-c': + docreate = 1 + elif option[0] == "-2": + sqlite_version = "2" + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-n': + nrows = int(option[1]) + + # Catch the hdf5 file passed as the last argument + filename = pargs[0] + + if sqlite_version == "2": + import sqlite + else: + from pysqlite2 import dbapi2 as sqlite + + if verbose: + print("pysqlite version:", sqlite.version) + if userandom: + print("using random values") + + if docreate: + if verbose: + print("writing %s krows" % nrows) + if psyco_imported and usepsyco: + psyco.bind(create_db) + nrows *= 1000 + create_db(filename, nrows) + + if createindex: + index_db(filename) + + if doquery: + query_db(filename, rng) diff --git a/bench/stress-test.py b/bench/stress-test.py new file mode 100644 index 0000000..ab72927 --- /dev/null +++ b/bench/stress-test.py 
@@ -0,0 +1,399 @@ +import gc +import sys +from time import perf_counter as clock +from time import process_time as cpuclock +import numpy as np +import tables as tb + + +class Test(tb.IsDescription): + ngroup = tb.Int32Col(pos=1) + ntable = tb.Int32Col(pos=2) + nrow = tb.Int32Col(pos=3) + #string = StringCol(itemsize=500, pos=4) + +TestDict = { + "ngroup": tb.Int32Col(pos=1), + "ntable": tb.Int32Col(pos=2), + "nrow": tb.Int32Col(pos=3), +} + + +def createFileArr(filename, ngroups, ntables, nrows): + + # First, create the groups + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + # Now, create the arrays + arr = np.arange(nrows) + for k in range(ngroups): + fileh = tb.open_file(filename, mode="a", root_uep='group%04d' % k) + for j in range(ntables): + # Create the array + fileh.create_array("/", 'array%04d' % j, arr, "Array %d" % j) + fileh.close() + + return (ngroups * ntables * nrows, 4) + + +def readFileArr(filename, ngroups, recsize, verbose): + + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % ngroup) + # Get the group + group = fileh.root + narrai = 0 + if verbose: + print("Group ==>", group) + for arrai in fileh.list_nodes(group, 'Array'): + if verbose > 1: + print("Array ==>", arrai) + print("Rows in", arrai._v_pathname, ":", arrai.shape) + + arr = arrai.read() + + rowsread += len(arr) + narrai += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, 4, rowsread * 4) + + +def createFile(filename, ngroups, ntables, nrows, complevel, complib, recsize): + + # First, create the groups + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + group = fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + # Now, create the tables + rowswritten = 0 + if not ntables: + rowsize = 0 + + for k in range(ngroups): + print("Filling tables in group:", k) + fileh = tb.open_file(filename, mode="a", root_uep='group%04d' % k) + # Get the group + group = fileh.root + for j in range(ntables): + # Create a table + # table = fileh.create_table(group, 'table%04d'% j, Test, + table = fileh.create_table(group, 'table%04d' % j, TestDict, + 'Table%04d' % j, + complevel, complib, nrows) + rowsize = table.rowsize + # Get the row object associated with the new table + row = table.row + # Fill the table + for i in range(nrows): + row['ngroup'] = k + row['ntable'] = j + row['nrow'] = i + row.append() + + rowswritten += nrows + table.flush() + + # Close the file + fileh.close() + + return (rowswritten, rowsize) + + +def readFile(filename, ngroups, recsize, verbose): + # Open the HDF5 file in read-only mode + + rowsize = 0 + buffersize = 0 + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % ngroup) + # Get the group + group = fileh.root + ntable = 0 + if verbose: + print("Group ==>", group) + for table in fileh.list_nodes(group, 'Table'): + rowsize = table.rowsize + buffersize = table.rowsize * table.nrowsinbuf + if verbose > 1: + print("Table ==>", table) + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", 
table.nrowsinbuf) + + nrow = 0 + if table.nrows > 0: # only read if we have rows in tables + for row in table: + try: + assert row["ngroup"] == ngroup + assert row["ntable"] == ntable + assert row["nrow"] == nrow + except: + print("Error in group: %d, table: %d, row: %d" % + (ngroup, ntable, nrow)) + print("Record ==>", row) + nrow += 1 + + assert nrow == table.nrows + rowsread += table.nrows + ntable += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, rowsize, buffersize) + + +class TrackRefs: + + """Object to track reference counts across test runs.""" + + def __init__(self, verbose=0): + self.type2count = {} + self.type2all = {} + self.verbose = verbose + + def update(self, verbose=0): + obs = sys.getobjects(0) + type2count = {} + type2all = {} + for o in obs: + all = sys.getrefcount(o) + t = type(o) + if verbose: + # if t == types.TupleType: + if isinstance(o, tb.Group): + # if isinstance(o, MetaIsDescription): + print("-->", o, "refs:", all) + refrs = gc.get_referrers(o) + trefrs = [] + for refr in refrs: + trefrs.append(type(refr)) + print("Referrers -->", refrs) + print("Referrers types -->", trefrs) + # if t == types.StringType: print "-->",o + if t in type2count: + type2count[t] += 1 + type2all[t] += all + else: + type2count[t] = 1 + type2all[t] = all + + ct = sorted([(type2count[t] - self.type2count.get(t, 0), + type2all[t] - self.type2all.get(t, 0), + t) + for t in type2count.keys()]) + ct.reverse() + for delta1, delta2, t in ct: + if delta1 or delta2: + print("%-55s %8d %8d" % (t, delta1, delta2)) + + self.type2count = type2count + self.type2all = type2all + + +def dump_refs(preheat=10, iter1=10, iter2=10, *testargs): + + rc1 = rc2 = None + # testMethod() + for i in range(preheat): + testMethod(*testargs) + gc.collect() + rc1 = sys.gettotalrefcount() + track = TrackRefs() + for i in range(iter1): + testMethod(*testargs) + print("First output of TrackRefs:") + gc.collect() + rc2 = sys.gettotalrefcount() + track.update() + print("Inc refs in function testMethod --> %5d" % (rc2 - rc1), + file=sys.stderr) + for i in range(iter2): + testMethod(*testargs) + track.update(verbose=1) + print("Second output of TrackRefs:") + gc.collect() + rc3 = sys.gettotalrefcount() + + print("Inc refs in function testMethod --> %5d" % (rc3 - rc2), + file=sys.stderr) + + +def dump_garbage(): + """show us waht the garbage is about.""" + # Force collection + print("\nGARBAGE:") + gc.collect() + + print("\nGARBAGE OBJECTS:") + for x in gc.garbage: + s = str(x) + #if len(s) > 80: s = s[:77] + "..." 
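+        # print the type and full repr of every object the collector left in
+        # gc.garbage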
+ print(type(x), "\n ", s) + + # print "\nTRACKED OBJECTS:" + # reportLoggedInstances("*") + + +def testMethod(file, usearray, testwrite, testread, complib, complevel, + ngroups, ntables, nrows): + + if complevel > 0: + print("Compression library:", complib) + if testwrite: + t1 = clock() + cpu1 = cpuclock() + if usearray: + (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows) + else: + (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows, + complevel, complib, recsize) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + + if testread: + t1 = clock() + cpu1 = cpuclock() + if usearray: + (rowsr, rowsz, bufsz) = readFileArr(file, + ngroups, recsize, verbose) + else: + (rowsr, rowsz, bufsz) = readFile(file, ngroups, recsize, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + print(f"Rows read: {rowsw} Row size: {rowsz}, Buf size: {bufsz}") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read rows/sec: {rowsr / treadrows}") + print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}") + +if __name__ == "__main__": + import getopt + import profile + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-d debug] [-v level] [-p] [-r] [-w] [-l complib] [-c complevel] [-g ngroups] [-t ntables] [-i nrows] file + -d debugging level + -v verbosity level + -p use "psyco" if available + -a use Array objects instead of Table + -r only read test + -w only write test + -l sets the compression library to be used ("zlib", "lzo", "ucl", "bzip2") + -c sets a compression level (do not set it or 0 for no compression) + -g number of groups hanging from "/" + -t number of tables per group + -i number of rows per table +""" + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'd:v:parwl:c:g:t:i:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + ngroups = 5 + ntables = 5 + nrows = 100 + verbose = 0 + debug = 0 + recsize = "medium" + testread = 1 + testwrite = 1 + usepsyco = 0 + usearray = 0 + complevel = 0 + complib = "zlib" + + # Get the options + for option in opts: + if option[0] == '-d': + debug = int(option[1]) + if option[0] == '-v': + verbose = int(option[1]) + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-a': + usearray = 1 + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-g': + ngroups = int(option[1]) + elif option[0] == '-t': + ntables = int(option[1]) + elif option[0] == '-i': + nrows = int(option[1]) + + if debug: + gc.enable() + + if debug == 1: + gc.set_debug(gc.DEBUG_LEAK) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + if psyco_imported and usepsyco: + psyco.bind(createFile) + psyco.bind(readFile) + + if debug == 2: + dump_refs(10, 10, 15, file, usearray, testwrite, testread, complib, + complevel, ngroups, ntables, nrows) + else: +# testMethod(file, usearray, testwrite, testread, 
complib, complevel, +# ngroups, ntables, nrows) + profile.run("testMethod(file, usearray, testwrite, testread, " + + "complib, complevel, ngroups, ntables, nrows)") + + # Show the dirt + if debug == 1: + dump_garbage() diff --git a/bench/stress-test2.py b/bench/stress-test2.py new file mode 100644 index 0000000..a68635a --- /dev/null +++ b/bench/stress-test2.py @@ -0,0 +1,238 @@ +import gc +import sys +import random +from time import perf_counter as clock +from time import process_time as cpuclock +import tables as tb + + +class Test(tb.IsDescription): + ngroup = tb.Int32Col(pos=1) + ntable = tb.Int32Col(pos=2) + nrow = tb.Int32Col(pos=3) + time = tb.Float64Col(pos=5) + random = tb.Float32Col(pos=4) + + +def createFile(filename, ngroups, ntables, nrows, complevel, complib, recsize): + + # First, create the groups + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + group = fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + # Now, create the tables + rowswritten = 0 + for k in range(ngroups): + fileh = tb.open_file(filename, mode="a", root_uep='group%04d' % k) + # Get the group + group = fileh.root + for j in range(ntables): + # Create a table + table = fileh.create_table(group, 'table%04d' % j, Test, + 'Table%04d' % j, + complevel, complib, nrows) + # Get the row object associated with the new table + row = table.row + # Fill the table + for i in range(nrows): + row['time'] = clock() + row['random'] = random.random() * 40 + 100 + row['ngroup'] = k + row['ntable'] = j + row['nrow'] = i + row.append() + + rowswritten += nrows + table.flush() + + # Close the file + fileh.close() + + return (rowswritten, table.rowsize) + + +def readFile(filename, ngroups, recsize, verbose): + # Open the HDF5 file in read-only mode + + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % ngroup) + # Get the group + group = fileh.root + ntable = 0 + if verbose: + print("Group ==>", group) + for table in fileh.list_nodes(group, 'Table'): + rowsize = table.rowsize + buffersize = table.rowsize * table.nrowsinbuf + if verbose > 1: + print("Table ==>", table) + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + + nrow = 0 + time_1 = 0.0 + for row in table: + try: + # print "row['ngroup'], ngroup ==>", row["ngroup"], ngroup + assert row["ngroup"] == ngroup + assert row["ntable"] == ntable + assert row["nrow"] == nrow + # print "row['time'], time_1 ==>", row["time"], time_1 + assert row["time"] >= (time_1 - 0.01) + #assert 100 <= row["random"] <= 139.999 + assert 100 <= row["random"] <= 140 + except: + print("Error in group: %d, table: %d, row: %d" % + (ngroup, ntable, nrow)) + print("Record ==>", row) + time_1 = row["time"] + nrow += 1 + + assert nrow == table.nrows + rowsread += table.nrows + ntable += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, rowsize, buffersize) + + +def dump_garbage(): + """show us waht the garbage is about.""" + # Force collection + print("\nGARBAGE:") + gc.collect() + + print("\nGARBAGE OBJECTS:") + for x in gc.garbage: + s = str(x) + #if len(s) > 80: s = s[:77] + "..." 
+ print(type(x), "\n ", s) + +if __name__ == "__main__": + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-d debug] [-v level] [-p] [-r] [-w] [-l complib] [-c complevel] [-g ngroups] [-t ntables] [-i nrows] file + -d debugging level + -v verbosity level + -p use "psyco" if available + -r only read test + -w only write test + -l sets the compression library to be used ("zlib", "lzo", "ucl", "bzip2") + -c sets a compression level (do not set it or 0 for no compression) + -g number of groups hanging from "/" + -t number of tables per group + -i number of rows per table +""" + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'd:v:prwl:c:g:t:i:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + ngroups = 5 + ntables = 5 + nrows = 100 + verbose = 0 + debug = 0 + recsize = "medium" + testread = 1 + testwrite = 1 + usepsyco = 0 + complevel = 0 + complib = "zlib" + + # Get the options + for option in opts: + if option[0] == '-d': + debug = int(option[1]) + if option[0] == '-v': + verbose = int(option[1]) + if option[0] == '-p': + usepsyco = 1 + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-g': + ngroups = int(option[1]) + elif option[0] == '-t': + ntables = int(option[1]) + elif option[0] == '-i': + nrows = int(option[1]) + + if debug: + gc.enable() + gc.set_debug(gc.DEBUG_LEAK) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if testwrite: + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(createFile) + (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows, + complevel, complib, recsize) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + + if testread: + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(readFile) + (rowsr, rowsz, bufsz) = readFile(file, ngroups, recsize, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + print(f"Rows read: {rowsw} Row size: {rowsz}, Buf size: {bufsz}") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read rows/sec: {rowsr / treadrows}") + print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}") + + # Show the dirt + if debug > 1: + dump_garbage() diff --git a/bench/stress-test3.py b/bench/stress-test3.py new file mode 100644 index 0000000..233166e --- /dev/null +++ b/bench/stress-test3.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python + +"""This script allows to create arbitrarily large files with the desired +combination of groups, tables per group and rows per table. + +Issue "python stress-test3.py" without parameters for a help on usage. 
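+
+Unlike stress-test.py, every table row also carries a 500-character string
+column, so the generated files can grow quickly with the number of rows per
+table.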
+ +""" + +import gc +import sys +from time import perf_counter as clock +from time import process_time as cpuclock +import tables as tb + + +class Test(tb.IsDescription): + ngroup = tb.Int32Col(pos=1) + ntable = tb.Int32Col(pos=2) + nrow = tb.Int32Col(pos=3) + string = tb.StringCol(500, pos=4) + + +def createFileArr(filename, ngroups, ntables, nrows): + + # First, create the groups + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + return (0, 4) + + +def readFileArr(filename, ngroups, recsize, verbose): + + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % ngroup) + # Get the group + group = fileh.root + ntable = 0 + if verbose: + print("Group ==>", group) + for table in fileh.list_nodes(group, 'Array'): + if verbose > 1: + print("Array ==>", table) + print("Rows in", table._v_pathname, ":", table.shape) + + arr = table.read() + + rowsread += len(arr) + ntable += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, 4, 0) + + +def createFile(filename, ngroups, ntables, nrows, complevel, complib, recsize): + + # First, create the groups + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + group = fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + # Now, create the tables + rowswritten = 0 + for k in range(ngroups): + fileh = tb.open_file(filename, mode="a", root_uep='group%04d' % k) + # Get the group + group = fileh.root + for j in range(ntables): + # Create a table + table = fileh.create_table(group, 'table%04d' % j, Test, + 'Table%04d' % j, + tb.Filters(complevel, complib), nrows) + rowsize = table.rowsize + # Get the row object associated with the new table + row = table.row + # Fill the table + for i in range(nrows): + row['ngroup'] = k + row['ntable'] = j + row['nrow'] = i + row.append() + + rowswritten += nrows + table.flush() + + # Close the file + fileh.close() + + return (rowswritten, rowsize) + + +def readFile(filename, ngroups, recsize, verbose): + # Open the HDF5 file in read-only mode + + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % ngroup) + # Get the group + group = fileh.root + ntable = 0 + if verbose: + print("Group ==>", group) + for table in fileh.list_nodes(group, 'Table'): + rowsize = table.rowsize + buffersize = table.rowsize * table.nrowsinbuf + if verbose > 1: + print("Table ==>", table) + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + + nrow = 0 + for row in table: + try: + assert row["ngroup"] == ngroup + assert row["ntable"] == ntable + assert row["nrow"] == nrow + except: + print("Error in group: %d, table: %d, row: %d" % + (ngroup, ntable, nrow)) + print("Record ==>", row) + nrow += 1 + + assert nrow == table.nrows + rowsread += table.nrows + ntable += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, rowsize, buffersize) + + +def dump_garbage(): + """show us waht the garbage is about.""" + # Force collection + print("\nGARBAGE:") + gc.collect() + + print("\nGARBAGE OBJECTS:") + 
for x in gc.garbage: + s = str(x) + #if len(s) > 80: s = s[:77] + "..." + print(type(x), "\n ", s) + +if __name__ == "__main__": + import getopt + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + usage = """usage: %s [-d debug] [-v level] [-p] [-r] [-w] [-l complib] [-c complevel] [-g ngroups] [-t ntables] [-i nrows] file + -d debugging level + -v verbosity level + -p use "psyco" if available + -a use Array objects instead of Table + -r only read test + -w only write test + -l sets the compression library to be used ("zlib", "lzo", "ucl", "bzip2") + -c sets a compression level (do not set it or 0 for no compression) + -g number of groups hanging from "/" + -t number of tables per group + -i number of rows per table +""" + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'd:v:parwl:c:g:t:i:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + ngroups = 5 + ntables = 5 + nrows = 100 + verbose = 0 + debug = 0 + recsize = "medium" + testread = 1 + testwrite = 1 + usepsyco = 0 + usearray = 0 + complevel = 0 + complib = "zlib" + + # Get the options + for option in opts: + if option[0] == '-d': + debug = int(option[1]) + if option[0] == '-v': + verbose = int(option[1]) + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-a': + usearray = 1 + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-g': + ngroups = int(option[1]) + elif option[0] == '-t': + ntables = int(option[1]) + elif option[0] == '-i': + nrows = int(option[1]) + + if debug: + gc.enable() + gc.set_debug(gc.DEBUG_LEAK) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if testwrite: + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(createFile) + if usearray: + (rowsw, rowsz) = createFileArr(file, ngroups, ntables, nrows) + else: + (rowsw, rowsz) = createFile(file, ngroups, ntables, nrows, + complevel, complib, recsize) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + + if testread: + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(readFile) + if usearray: + (rowsr, rowsz, bufsz) = readFileArr(file, + ngroups, recsize, verbose) + else: + (rowsr, rowsz, bufsz) = readFile(file, ngroups, recsize, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + print(f"Rows read: {rowsw} Row size: {rowsz}, Buf size: {bufsz}") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read rows/sec: {rowsr / treadrows}") + print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}") + + # Show the dirt + if debug > 1: + dump_garbage() diff --git a/bench/table-bench.py b/bench/table-bench.py new file mode 100644 index 0000000..dd2f3e1 --- /dev/null +++ b/bench/table-bench.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python + +import 
numpy as np +import tables as tb + +# This class is accessible only for the examples + + +class Small(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, pos=2) + var2 = tb.Int32Col(pos=1) + var3 = tb.Float64Col(pos=0) + +# Define a user record to characterize some kind of particles + + +class Medium(tb.IsDescription): + name = tb.StringCol(itemsize=16, pos=0) # 16-character String + float1 = tb.Float64Col(shape=2, dflt=np.arange(2), pos=1) + #float1 = Float64Col(dflt=2.3) + #float2 = Float64Col(dflt=2.3) + # zADCcount = Int16Col() # signed short integer + ADCcount = tb.Int32Col(pos=6) # signed short integer + grid_i = tb.Int32Col(pos=7) # integer + grid_j = tb.Int32Col(pos=8) # integer + pressure = tb.Float32Col(pos=9) # float (single-precision) + energy = tb.Float64Col(pos=2) # double (double-precision) + # unalig = Int8Col() # just to unalign data + +# Define a user record to characterize some kind of particles + + +class Big(tb.IsDescription): + name = tb.StringCol(itemsize=16) # 16-character String + float1 = tb.Float64Col(shape=32, dflt=np.arange(32)) + float2 = tb.Float64Col(shape=32, dflt=2.2) + TDCcount = tb.Int8Col() # signed short integer + #ADCcount = Int32Col() + # ADCcount = Int16Col() # signed short integer + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col() # double (double-precision) + + +def createFile(filename, totalrows, filters, recsize): + + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="Table Benchmark", + filters=filters) + + # Table title + title = "This is the table title" + + # Create a Table instance + group = fileh.root + rowswritten = 0 + for j in range(3): + # Create a table + if recsize == "big": + table = fileh.create_table(group, 'tuple' + str(j), Big, title, + None, + totalrows) + elif recsize == "medium": + table = fileh.create_table(group, 'tuple' + str(j), Medium, title, + None, + totalrows) + elif recsize == "small": + table = fileh.create_table(group, 'tuple' + str(j), Small, title, + None, + totalrows) + else: + raise RuntimeError("This should never happen") + + table.attrs.test = 2 + rowsize = table.rowsize + # Get the row object associated with the new table + d = table.row + # Fill the table + if recsize == "big": + for i in range(totalrows): + # d['name'] = 'Part: %6d' % (i) + d['TDCcount'] = i % 256 + #d['float1'] = NP.array([i]*32, NP.float64) + #d['float2'] = NP.array([i**2]*32, NP.float64) + #d['float1'][0] = float(i) + #d['float2'][0] = float(i*2) + # Common part with medium + d['grid_i'] = i + d['grid_j'] = 10 - i + d['pressure'] = float(i * i) + # d['energy'] = float(d['pressure'] ** 4) + d['energy'] = d['pressure'] + # d['idnumber'] = i * (2 ** 34) + d.append() + elif recsize == "medium": + for i in range(totalrows): + #d['name'] = 'Part: %6d' % (i) + #d['float1'] = NP.array([i]*2, NP.float64) + #d['float1'] = arr + #d['float1'] = i + #d['float2'] = float(i) + # Common part with big: + d['grid_i'] = i + d['grid_j'] = 10 - i + d['pressure'] = i * 2 + # d['energy'] = float(d['pressure'] ** 4) + d['energy'] = d['pressure'] + d.append() + else: # Small record + for i in range(totalrows): + #d['var1'] = str(random.randrange(1000000)) + #d['var3'] = random.randrange(10000000) + d['var1'] = str(i) + #d['var2'] = random.randrange(totalrows) + d['var2'] = i + #d['var3'] = 12.1e10 + d['var3'] = totalrows - i + d.append() # This is a 10% faster than table.append() + rowswritten += totalrows + + if recsize == 
"small": + # Testing with indexing + pass +# table._createIndex("var3", Filters(1,"zlib",shuffle=1)) + + # table.flush() + group._v_attrs.test2 = "just a test" + # Create a new group + group2 = fileh.create_group(group, 'group' + str(j)) + # Iterate over this new group (group2) + group = group2 + table.flush() + + # Close the file (eventually destroy the extended type) + fileh.close() + return (rowswritten, rowsize) + + +def readFile(filename, recsize, verbose): + # Open the HDF5 file in read-only mode + + fileh = tb.open_file(filename, mode="r") + rowsread = 0 + for groupobj in fileh.walk_groups(fileh.root): + # print "Group pathname:", groupobj._v_pathname + row = 0 + for table in fileh.list_nodes(groupobj, 'Table'): + rowsize = table.rowsize + print("reading", table) + if verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + + if recsize == "big" or recsize == "medium": + # e = [ p.float1 for p in table.iterrows() + # if p.grid_i < 2 ] + #e = [ str(p) for p in table.iterrows() ] + # if p.grid_i < 2 ] +# e = [ p['grid_i'] for p in table.iterrows() +# if p['grid_j'] == 20 and p['grid_i'] < 20 ] +# e = [ p['grid_i'] for p in table +# if p['grid_i'] <= 2 ] +# e = [ p['grid_i'] for p in table.where("grid_i<=20")] +# e = [ p['grid_i'] for p in +# table.where('grid_i <= 20')] + e = [p['grid_i'] for p in + table.where('(grid_i <= 20) & (grid_j == 20)')] +# e = [ p['grid_i'] for p in table.iterrows() +# if p.nrow() == 20 ] +# e = [ table.delrow(p.nrow()) for p in table.iterrows() +# if p.nrow() == 20 ] + # The version with a for loop is only 1% better than + # comprenhension list + #e = [] + # for p in table.iterrows(): + # if p.grid_i < 20: + # e.append(p.grid_j) + else: # small record case +# e = [ p['var3'] for p in table.iterrows() +# if p['var2'] < 20 and p['var3'] < 20 ] +# e = [ p['var3'] for p in table.where("var3 <= 20") +# if p['var2'] < 20 ] +# e = [ p['var3'] for p in table.where("var3 <= 20")] +# Cuts 1) and 2) issues the same results but 2) is about 10 times faster +# Cut 1) +# e = [ p.nrow() for p in +# table.where(table.cols.var2 > 5) +# if p["var2"] < 10] +# Cut 2) +# e = [ p.nrow() for p in +# table.where(table.cols.var2 < 10) +# if p["var2"] > 5] +# e = [ (p._nrow,p["var3"]) for p in +# e = [ p["var3"] for p in +# table.where(table.cols.var3 < 10)] +# table.where(table.cols.var3 < 10)] +# table if p["var3"] <= 10] +# e = [ p['var3'] for p in table.where("var3 <= 20")] +# e = [ p['var3'] for p in +# table.where(table.cols.var1 == "10")] # More + # than ten times faster than the next one +# e = [ p['var3'] for p in table +# if p['var1'] == "10"] +# e = [ p['var3'] for p in table.where('var2 <= 20')] + e = [p['var3'] + for p in table.where('(var2 <= 20) & (var2 >= 3)')] + # e = [ p[0] for p in table.where('var2 <= 20')] + #e = [ p['var3'] for p in table if p['var2'] <= 20 ] + # e = [ p[:] for p in table if p[1] <= 20 ] +# e = [ p['var3'] for p in table._whereInRange(table.cols.var2 <=20)] + #e = [ p['var3'] for p in table.iterrows(0,21) ] +# e = [ p['var3'] for p in table.iterrows() +# if p.nrow() <= 20 ] + #e = [ p['var3'] for p in table.iterrows(1,0,1000)] + #e = [ p['var3'] for p in table.iterrows(1,100)] + # e = [ p['var3'] for p in table.iterrows(step=2) + # if p.nrow() < 20 ] + # e = [ p['var2'] for p in table.iterrows() + # if p['var2'] < 20 ] + # for p in table.iterrows(): + # pass + if verbose: + # print "Last record 
read:", p + print("resulting selection list ==>", e) + + rowsread += table.nrows + row += 1 + if verbose: + print("Total selected records ==> ", len(e)) + + # Close the file (eventually destroy the extended type) + fileh.close() + + return (rowsread, rowsize) + + +def readField(filename, field, rng, verbose): + fileh = tb.open_file(filename, mode="r") + rowsread = 0 + if rng is None: + rng = [0, -1, 1] + if field == "all": + field = None + for groupobj in fileh.walk_groups(fileh.root): + for table in fileh.list_nodes(groupobj, 'Table'): + rowsize = table.rowsize + # table.nrowsinbuf = 3 # For testing purposes + if verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + print("(field, start, stop, step) ==>", (field, rng[0], rng[1], + rng[2])) + + e = table.read(rng[0], rng[1], rng[2], field) + + rowsread += table.nrows + if verbose: + print("Selected rows ==> ", e) + print("Total selected rows ==> ", len(e)) + + # Close the file (eventually destroy the extended type) + fileh.close() + return (rowsread, rowsize) + +if __name__ == "__main__": + import sys + import getopt + + try: + import psyco + psyco_imported = 1 + except: + psyco_imported = 0 + + from time import perf_counter as clock + from time import process_time as cpuclock + + usage = """usage: %s [-v] [-p] [-P] [-R range] [-r] [-w] [-s recsize] [-f field] [-c level] [-l complib] [-i iterations] [-S] [-F] file + -v verbose + -p use "psyco" if available + -P do profile + -R select a range in a field in the form "start,stop,step" + -r only read test + -w only write test + -s use [big] record, [medium] or [small] + -f only read stated field name in tables ("all" means all fields) + -c sets a compression level (do not set it or 0 for no compression) + -S activate shuffling filter + -F activate fletcher32 filter + -l sets the compression library to be used ("zlib", "lzo", "blosc", "bzip2") + -i sets the number of rows in each table\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpPSFR:rwf:s:c:l:i:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + profile = 0 + rng = None + recsize = "medium" + fieldName = None + testread = 1 + testwrite = 1 + usepsyco = 0 + complevel = 0 + shuffle = 0 + fletcher32 = 0 + complib = "zlib" + iterations = 100 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + if option[0] == '-p': + usepsyco = 1 + if option[0] == '-P': + profile = 1 + if option[0] == '-S': + shuffle = 1 + if option[0] == '-F': + fletcher32 = 1 + elif option[0] == '-R': + rng = [int(i) for i in option[1].split(",")] + elif option[0] == '-r': + testwrite = 0 + elif option[0] == '-w': + testread = 0 + elif option[0] == '-f': + fieldName = option[1] + elif option[0] == '-s': + recsize = option[1] + if recsize not in ["big", "medium", "small"]: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-c': + complevel = int(option[1]) + elif option[0] == '-l': + complib = option[1] + elif option[0] == '-i': + iterations = int(option[1]) + + # Build the Filters instance + filters = tb.Filters(complevel=complevel, complib=complib, + shuffle=shuffle, fletcher32=fletcher32) + + # Catch the hdf5 file passed as the last argument + file = pargs[0] + + if verbose: + print("numpy version:", 
np.__version__) + if psyco_imported and usepsyco: + print("Using psyco version:", psyco.version_info) + + if testwrite: + print("Compression level:", complevel) + if complevel > 0: + print("Compression library:", complib) + if shuffle: + print("Suffling...") + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(createFile) + if profile: + import profile as prof + import pstats + prof.run( + '(rowsw, rowsz) = createFile(file, iterations, filters, ' + 'recsize)', + 'table-bench.prof') + stats = pstats.Stats('table-bench.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats(20) + else: + (rowsw, rowsz) = createFile(file, iterations, filters, recsize) + t2 = clock() + cpu2 = cpuclock() + tapprows = t2 - t1 + cpuapprows = cpu2 - cpu1 + print(f"Rows written: {rowsw} Row size: {rowsz}") + print( + f"Time writing rows: {tapprows:.3f} s (real) " + f"{cpuapprows:.3f} s (cpu) {cpuapprows / tapprows:.0%}") + print(f"Write rows/sec: {rowsw / tapprows}") + print(f"Write KB/s : {rowsw * rowsz / (tapprows * 1024):.0f}") + + if testread: + t1 = clock() + cpu1 = cpuclock() + if psyco_imported and usepsyco: + psyco.bind(readFile) + # psyco.bind(readField) + pass + if rng or fieldName: + (rowsr, rowsz) = readField(file, fieldName, rng, verbose) + pass + else: + for i in range(1): + (rowsr, rowsz) = readFile(file, recsize, verbose) + t2 = clock() + cpu2 = cpuclock() + treadrows = t2 - t1 + cpureadrows = cpu2 - cpu1 + print(f"Rows read: {rowsw} Row size: {rowsz}") + print( + f"Time reading rows: {treadrows:.3f} s (real) " + f"{cpureadrows:.3f} s (cpu) {cpureadrows / treadrows:.0%}") + print(f"Read rows/sec: {rowsr / treadrows}") + print(f"Read KB/s : {rowsr * rowsz / (treadrows * 1024):.0f}") diff --git a/bench/table-copy.py b/bench/table-copy.py new file mode 100644 index 0000000..af62911 --- /dev/null +++ b/bench/table-copy.py @@ -0,0 +1,115 @@ +from time import perf_counter as clock + +import numpy as np +import tables as tb + +N = 144_000 +#N = 144 + + +def timed(func, *args, **kwargs): + start = clock() + res = func(*args, **kwargs) + print(f"{clock() - start:.3f}s elapsed.") + return res + + +def create_table(output_path): + print("creating array...", end=' ') + dt = np.dtype([('field%d' % i, int) for i in range(320)]) + a = np.zeros(N, dtype=dt) + print("done.") + + output_file = tb.open_file(output_path, mode="w") + table = output_file.create_table("/", "test", dt) # , filters=blosc4) + print("appending data...", end=' ') + table.append(a) + print("flushing...", end=' ') + table.flush() + print("done.") + output_file.close() + + +def copy1(input_path, output_path): + print(f"copying data from {input_path} to {output_path}...") + input_file = tb.open_file(input_path, mode="r") + output_file = tb.open_file(output_path, mode="w") + + # copy nodes as a batch + input_file.copy_node("/", output_file.root, recursive=True) + output_file.close() + input_file.close() + + +def copy2(input_path, output_path): + print(f"copying data from {input_path} to {output_path}...") + input_file = tb.open_file(input_path, mode="r") + input_file.copy_file(output_path, overwrite=True) + input_file.close() + + +def copy3(input_path, output_path): + print(f"copying data from {input_path} to {output_path}...") + input_file = tb.open_file(input_path, mode="r") + output_file = tb.open_file(output_path, mode="w") + table = input_file.root.test + table.copy(output_file.root) + output_file.close() + input_file.close() + + +def copy4(input_path, output_path, complib='zlib', 
complevel=0): + print(f"copying data from {input_path} to {output_path}...") + input_file = tb.open_file(input_path, mode="r") + output_file = tb.open_file(output_path, mode="w") + + input_table = input_file.root.test + print("reading data...", end=' ') + data = input_file.root.test.read() + print("done.") + + filter = tb.Filters(complevel=complevel, complib=complib) + output_table = output_file.create_table("/", "test", input_table.dtype, + filters=filter) + print("appending data...", end=' ') + output_table.append(data) + print("flushing...", end=' ') + output_table.flush() + print("done.") + + input_file.close() + output_file.close() + + +def copy5(input_path, output_path, complib='zlib', complevel=0): + print(f"copying data from {input_path} to {output_path}...") + input_file = tb.open_file(input_path, mode="r") + output_file = tb.open_file(output_path, mode="w") + + input_table = input_file.root.test + + filter = tb.Filters(complevel=complevel, complib=complib) + output_table = output_file.create_table("/", "test", input_table.dtype, + filters=filter) + chunksize = 10_000 + rowsleft = len(input_table) + start = 0 + for chunk in range((len(input_table) / chunksize) + 1): + stop = start + min(chunksize, rowsleft) + data = input_table.read(start, stop) + output_table.append(data) + output_table.flush() + rowsleft -= chunksize + start = stop + + input_file.close() + output_file.close() + + +if __name__ == '__main__': + timed(create_table, 'tmp.h5') +# timed(copy1, 'tmp.h5', 'test1.h5') + timed(copy2, 'tmp.h5', 'test2.h5') +# timed(copy3, 'tmp.h5', 'test3.h5') + timed(copy4, 'tmp.h5', 'test4.h5') + timed(copy5, 'tmp.h5', 'test5.h5') diff --git a/bench/undo_redo.py b/bench/undo_redo.py new file mode 100644 index 0000000..b715c91 --- /dev/null +++ b/bench/undo_redo.py @@ -0,0 +1,228 @@ +"""Benchmark for undo/redo. 
+Run this program without parameters for mode of use.""" + +from time import perf_counter as clock +import numpy as np +import tables as tb + +verbose = 0 + + +class BasicBenchmark: + + def __init__(self, filename, testname, vecsize, nobjects, niter): + + self.file = filename + self.test = testname + self.vecsize = vecsize + self.nobjects = nobjects + self.niter = niter + + # Initialize the arrays + self.a1 = np.arange(0, 1 * self.vecsize) + self.a2 = np.arange(1 * self.vecsize, 2 * self.vecsize) + self.a3 = np.arange(2 * self.vecsize, 3 * self.vecsize) + + def setUp(self): + + # Create an HDF5 file + self.fileh = tb.open_file(self.file, mode="w") + # open the do/undo + self.fileh.enable_undo() + + def tearDown(self): + self.fileh.disable_undo() + self.fileh.close() + # Remove the temporary file + # os.remove(self.file) + + def createNode(self): + """Checking a undo/redo create_array.""" + + for i in range(self.nobjects): + # Create a new array + self.fileh.create_array('/', 'array' + str(i), self.a1) + # Put a mark + self.fileh.mark() + # Unwind all marks sequentially + for i in range(self.niter): + t1 = clock() + for i in range(self.nobjects): + self.fileh.undo() + if verbose: + print("u", end=' ') + if verbose: + print() + undo = clock() - t1 + # Rewind all marks sequentially + t1 = clock() + for i in range(self.nobjects): + self.fileh.redo() + if verbose: + print("r", end=' ') + if verbose: + print() + redo = clock() - t1 + + print("Time for Undo, Redo (createNode):", undo, "s, ", redo, "s") + + def copy_children(self): + """Checking a undo/redo copy_children.""" + + # Create a group + self.fileh.create_group('/', 'agroup') + # Create several objects there + for i in range(10): + # Create a new array + self.fileh.create_array('/agroup', 'array' + str(i), self.a1) + # Excercise copy_children + for i in range(self.nobjects): + # Create another group for destination + self.fileh.create_group('/', 'anothergroup' + str(i)) + # Copy children from /agroup to /anothergroup+i + self.fileh.copy_children('/agroup', '/anothergroup' + str(i)) + # Put a mark + self.fileh.mark() + # Unwind all marks sequentially + for i in range(self.niter): + t1 = clock() + for i in range(self.nobjects): + self.fileh.undo() + if verbose: + print("u", end=' ') + if verbose: + print() + undo = clock() - t1 + # Rewind all marks sequentially + t1 = clock() + for i in range(self.nobjects): + self.fileh.redo() + if verbose: + print("r", end=' ') + if verbose: + print() + redo = clock() - t1 + + print(("Time for Undo, Redo (copy_children):", undo, "s, ", + redo, "s")) + + def set_attr(self): + """Checking a undo/redo for setting attributes.""" + + # Create a new array + self.fileh.create_array('/', 'array', self.a1) + for i in range(self.nobjects): + # Set an attribute + setattr(self.fileh.root.array.attrs, "attr" + str(i), str(self.a1)) + # Put a mark + self.fileh.mark() + # Unwind all marks sequentially + for i in range(self.niter): + t1 = clock() + for i in range(self.nobjects): + self.fileh.undo() + if verbose: + print("u", end=' ') + if verbose: + print() + undo = clock() - t1 + # Rewind all marks sequentially + t1 = clock() + for i in range(self.nobjects): + self.fileh.redo() + if verbose: + print("r", end=' ') + if verbose: + print() + redo = clock() - t1 + + print("Time for Undo, Redo (set_attr):", undo, "s, ", redo, "s") + + def runall(self): + + if testname == "all": + tests = [self.createNode, self.copy_children, self.set_attr] + elif testname == "createNode": + tests = [self.createNode] + elif testname == 
"copy_children": + tests = [self.copy_children] + elif testname == "set_attr": + tests = [self.set_attr] + for meth in tests: + self.setUp() + meth() + self.tearDown() + + +if __name__ == '__main__': + import sys + import getopt + + usage = """usage: %s [-v] [-p] [-t test] [-s vecsize] [-n niter] datafile + -v verbose (total dump of profiling) + -p do profiling + -t {createNode|copy_children|set_attr|all} run the specified test + -s the size of vectors that are undone/redone + -n number of objects in operations + -i number of iterations for reading\n""" % sys.argv[0] + + try: + opts, pargs = getopt.getopt(sys.argv[1:], 'vpt:s:n:i:') + except: + sys.stderr.write(usage) + sys.exit(0) + + # if we pass too much parameters, abort + if len(pargs) != 1: + sys.stderr.write(usage) + sys.exit(0) + + # default options + verbose = 0 + profile = 0 + testname = "all" + vecsize = 10 + nobjects = 1 + niter = 1 + + # Get the options + for option in opts: + if option[0] == '-v': + verbose = 1 + elif option[0] == '-p': + profile = 1 + elif option[0] == '-t': + testname = option[1] + if testname not in ['createNode', 'copy_children', 'set_attr', + 'all']: + sys.stderr.write(usage) + sys.exit(0) + elif option[0] == '-s': + vecsize = int(option[1]) + elif option[0] == '-n': + nobjects = int(option[1]) + elif option[0] == '-i': + niter = int(option[1]) + + filename = pargs[0] + + bench = BasicBenchmark(filename, testname, vecsize, nobjects, niter) + if profile: + import hotshot + import hotshot.stats + prof = hotshot.Profile("do_undo.prof") + prof.runcall(bench.runall) + prof.close() + stats = hotshot.stats.load("do_undo.prof") + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + bench.runall() + +# Local Variables: +# mode: python +# End: diff --git a/bench/undo_redo.txt b/bench/undo_redo.txt new file mode 100644 index 0000000..8f0890e --- /dev/null +++ b/bench/undo_redo.txt @@ -0,0 +1,103 @@ +Benchmarks on PyTables Undo/Redo +================================ + +This is a small report for the performance of the Undo/Redo feature in +PyTables. + +A small script (see undo_redo.py) has been made in order to check +different scenarios for Undo/Redo, like creating single nodes, copying +children from one group to another, and creating attributes. + +Undo/Redo is independent of object size +--------------------------------------- + +Firstly, one thing to be noted is that the Undo/Redo feature is +independent of the object size that is being treated. For example, the +times for 10 objects (flag -n) each one with 10 elements (flag -s) is: + +$ time python2.4 undo_redo.py -n 10 -i 2 -s 10 data.nobackup/undo_redo.h5 +Time for Undo, Redo (createNode): 0.213686943054 s, 0.0727670192719 s +Time for Undo, Redo (createNode): 0.271666049957 s, 0.0740389823914 s +Time for Undo, Redo (copy_children): 0.296227931976 s, 0.161941051483 s +Time for Undo, Redo (copy_children): 0.363519906998 s, 0.162662982941 s +Time for Undo, Redo (set_attr): 0.208750009537 s, 0.0732419490814 s +Time for Undo, Redo (set_attr): 0.27628993988 s, 0.0736088752747 s + +real 0m5.557s +user 0m4.354s +sys 0m0.729s + +Note how all tests take more or less the same amount of time. This is +because a move operation is used as a central tool to implement the +Undo/Redo feature. Such a move operation has a constant cost, +independently of the size of the objects. 
For example, using objects +with 1000 elements, we can see that this does not affect the Undo/Redo +speed: + +$ time python2.4 undo_redo.py -n 10 -i 2 -s 1000 data.nobackup/undo_redo.h5 +Time for Undo, Redo (createNode): 0.213760137558 s, 0.0717759132385 s +Time for Undo, Redo (createNode): 0.276151895523 s, 0.0724079608917 s +Time for Undo, Redo (copy_children): 0.308417797089 s, 0.168260812759 s +Time for Undo, Redo (copy_children): 0.382102966309 s, 0.168042898178 s +Time for Undo, Redo (set_attr): 0.209735155106 s, 0.0740969181061 s +Time for Undo, Redo (set_attr): 0.279798984528 s, 0.0770981311798 s + +real 0m5.835s +user 0m4.585s +sys 0m0.736s + + +Undo/Redo times grow linearly with the number of objects implied +---------------------------------------------------------------- + +Secondly, the time for doing/undoing is obviously proportional +(linearly) to the number of objects that are implied in that process +(set by -n): + +$ time python2.4 undo_redo.py -n 100 -i 2 -s 10 data.nobackup/undo_redo.h5 +Time for Undo, Redo (createNode): 2.27267885208 s, 0.779091119766 s +Time for Undo, Redo (createNode): 2.31264209747 s, 0.766252040863 s +Time for Undo, Redo (copy_children): 3.01871585846 s, 1.63346219063 s +Time for Undo, Redo (copy_children): 3.07704997063 s, 1.62615203857 s +Time for Undo, Redo (set_attr): 2.18017196655 s, 0.809293985367 s +Time for Undo, Redo (set_attr): 2.23039293289 s, 0.809432029724 s + +real 0m48.395s +user 0m40.385s +sys 0m6.914s + + +A note on actual performance and place for improvement +------------------------------------------------------ + +Finally, note how the Undo/Redo capability of PyTables is pretty +fast. The next benchmark makes 1000 undo and 1000 redos for +create_array: + +$ time python2.4 undo_redo.py -n 1000 -i 2 -t createNode -s 1000 data.nobackup/undo_redo.h5 +Time for Undo, Redo (createNode): 22.7840828896 s, 7.9872610569 s +Time for Undo, Redo (createNode): 22.2799329758 s, 7.95833396912 s + +real 1m32.307s +user 1m16.598s +sys 0m15.105s + +i.e. an undo takes 23 milliseconds while a redo takes 8 milliseconds +approximately. + +The fact that undo operations take 3 times more than redo is probably +due to how the action log is implemented. The action log has been +implemented as a Table object, and PyTables has been optimized to read +rows of tables in *forward* direction (the one needed for redo +operations). However, when looking in *backward* direction (needed for +undo operations), the internal cache of PyTables is counterproductive +and makes look-ups quite slow (compared with forward access). +Nevertheless, the code for Undo/Redo has been optimized quite a bit to +smooth this kind of access as much as possible, but with a relative +success. A more definitive optimization should involve getting much +better performance for reading tables in backward direction. That +would be a major task, and can be eventually addressed in the future. + + +Francesc Alted +2005-03-10 diff --git a/bench/widetree.py b/bench/widetree.py new file mode 100644 index 0000000..707c23d --- /dev/null +++ b/bench/widetree.py @@ -0,0 +1,125 @@ +import hotshot +import hotshot.stats + +import unittest +import tempfile +from pathlib import Path +from time import perf_counter as clock + +import tables as tb + +verbose = 0 + + +class WideTreeTestCase(unittest.TestCase): + """Checks for maximum number of childs for a Group.""" + + def test00_Leafs(self): + """Checking creation of large number of leafs (1024) per group. + + Variable 'maxchilds' controls this check. 
PyTables support up to + 4096 childs per group, but this would take too much memory (up + to 64 MB) for testing purposes (may be we can add a test for big + platforms). A 1024 childs run takes up to 30 MB. A 512 childs + test takes around 25 MB. + + """ + + maxchilds = 1000 + if verbose: + print('\n', '-=' * 30) + print("Running %s.test00_wideTree..." % self.__class__.__name__) + print("Maximum number of childs tested :", maxchilds) + # Open a new empty HDF5 file + #file = tempfile.mktemp(".h5") + file = "test_widetree.h5" + + fileh = tb.open_file(file, mode="w") + if verbose: + print("Children writing progress: ", end=' ') + for child in range(maxchilds): + if verbose: + print("%3d," % (child), end=' ') + a = [1, 1] + fileh.create_group(fileh.root, 'group' + str(child), + "child: %d" % child) + fileh.create_array("/group" + str(child), 'array' + str(child), + a, "child: %d" % child) + if verbose: + print() + # Close the file + fileh.close() + + t1 = clock() + # Open the previous HDF5 file in read-only mode + fileh = tb.open_file(file, mode="r") + print("\nTime spent opening a file with %d groups + %d arrays: " + "%s s" % (maxchilds, maxchilds, clock() - t1)) + if verbose: + print("\nChildren reading progress: ", end=' ') + # Close the file + fileh.close() + # Then, delete the file + # os.remove(file) + + def test01_wideTree(self): + """Checking creation of large number of groups (1024) per group. + + Variable 'maxchilds' controls this check. PyTables support up to + 4096 childs per group, but this would take too much memory (up + to 64 MB) for testing purposes (may be we can add a test for big + platforms). A 1024 childs run takes up to 30 MB. A 512 childs + test takes around 25 MB. + + """ + + maxchilds = 1000 + if verbose: + print('\n', '-=' * 30) + print("Running %s.test00_wideTree..." 
% self.__class__.__name__) + print("Maximum number of childs tested :", maxchilds) + # Open a new empty HDF5 file + file = tempfile.mktemp(".h5") + #file = "test_widetree.h5" + + fileh = tb.open_file(file, mode="w") + if verbose: + print("Children writing progress: ", end=' ') + for child in range(maxchilds): + if verbose: + print("%3d," % (child), end=' ') + fileh.create_group(fileh.root, 'group' + str(child), + "child: %d" % child) + if verbose: + print() + # Close the file + fileh.close() + + t1 = clock() + # Open the previous HDF5 file in read-only mode + fileh = tb.open_file(file, mode="r") + print("\nTime spent opening a file with %d groups: %s s" % + (maxchilds, clock() - t1)) + # Close the file + fileh.close() + # Then, delete the file + Path(file).unlink() + +#---------------------------------------------------------------------- + + +def suite(): + theSuite = unittest.TestSuite() + theSuite.addTest(unittest.makeSuite(WideTreeTestCase)) + + return theSuite + + +if __name__ == '__main__': + prof = hotshot.Profile("widetree.prof") + benchtime, stones = prof.runcall(unittest.main(defaultTest='suite')) + prof.close() + stats = hotshot.stats.load("widetree.prof") + stats.strip_dirs() + stats.sort_stats('time', 'calls') + stats.print_stats(20) diff --git a/bench/widetree2.py b/bench/widetree2.py new file mode 100644 index 0000000..9ad00e3 --- /dev/null +++ b/bench/widetree2.py @@ -0,0 +1,112 @@ +import unittest + +import tables as tb + +verbose = 0 + + +class Test(tb.IsDescription): + ngroup = tb.Int32Col(pos=1) + ntable = tb.Int32Col(pos=2) + nrow = tb.Int32Col(pos=3) + #string = StringCol(itemsize=500, pos=4) + + +class WideTreeTestCase(unittest.TestCase): + + def test00_Leafs(self): + + # Open a new empty HDF5 file + filename = "test_widetree.h5" + ngroups = 10 + ntables = 300 + nrows = 10 + complevel = 0 + complib = "lzo" + + print("Writing...") + # Open a file in "w"rite mode + fileh = tb.open_file(filename, mode="w", title="PyTables Stress Test") + + for k in range(ngroups): + # Create the group + group = fileh.create_group("/", 'group%04d' % k, "Group %d" % k) + + fileh.close() + + # Now, create the tables + rowswritten = 0 + for k in range(ngroups): + print("Filling tables in group:", k) + fileh = tb.open_file(filename, mode="a", root_uep='group%04d' % k) + # Get the group + group = fileh.root + for j in range(ntables): + # Create a table + table = fileh.create_table(group, 'table%04d' % j, Test, + 'Table%04d' % j, + tb.Filters(complevel, complib), nrows) + # Get the row object associated with the new table + row = table.row + # Fill the table + for i in range(nrows): + row['ngroup'] = k + row['ntable'] = j + row['nrow'] = i + row.append() + + rowswritten += nrows + table.flush() + + # Close the file + fileh.close() + + # read the file + print("Reading...") + rowsread = 0 + for ngroup in range(ngroups): + fileh = tb.open_file(filename, mode="r", root_uep='group%04d' % + ngroup) + # Get the group + group = fileh.root + ntable = 0 + if verbose: + print("Group ==>", group) + for table in fileh.list_nodes(group, 'Table'): + if verbose > 1: + print("Table ==>", table) + print("Max rows in buf:", table.nrowsinbuf) + print("Rows in", table._v_pathname, ":", table.nrows) + print("Buffersize:", table.rowsize * table.nrowsinbuf) + print("MaxTuples:", table.nrowsinbuf) + + nrow = 0 + for row in table: + try: + assert row["ngroup"] == ngroup + assert row["ntable"] == ntable + assert row["nrow"] == nrow + except: + print("Error in group: %d, table: %d, row: %d" % + (ngroup, ntable, nrow)) + 
print("Record ==>", row) + nrow += 1 + + assert nrow == table.nrows + rowsread += table.nrows + ntable += 1 + + # Close the file (eventually destroy the extended type) + fileh.close() + + +#---------------------------------------------------------------------- +def suite(): + theSuite = unittest.TestSuite() + theSuite.addTest(unittest.makeSuite(WideTreeTestCase)) + + return theSuite + + +if __name__ == '__main__': + unittest.main(defaultTest='suite') diff --git a/bench/woody-pentiumIV.txt b/bench/woody-pentiumIV.txt new file mode 100644 index 0000000..a82baaf --- /dev/null +++ b/bench/woody-pentiumIV.txt @@ -0,0 +1,189 @@ +This is for Debian woody! + +Below are some benchmarking figures obtained while reading and writing +to a file with three tables, each table containing 10000 records. For +reference, the same tests have been repeated using the shelve module +that comes with Python. The tests were conducted on a platform with a +2 GHz AMD Athlon chip, an IDE disk at 4600 rpm, and 256 MB of RAM. + +Version 0.2 + + | medium size records | small size records + | (47 Bytes) | (16 Bytes) + +---------------------------+------------------------------ + | rows/s filesize | rows/s filesize + | write read | write read +------------+---------------------------+------------------------------ + no compress| | + record | 24400 39000 1184 KB | 32600 52600 506 KB + tupla | 17100 81100 1184 KB | 66666 107142 506 KB +------------+---------------------------+------------------------------ + compress | | + record | 22200 37500 494 KB | 31900 51700 94 KB + tupla | 16100 75000 494 KB | 63900 107142 94 KB +------------+---------------------------+------------------------------ + Shelve | 25800 14400 2500 KB | 68200 17000 921 KB + +New version (15-Jan-2003) + + + PyTables pre-0.3 + +Rec length | rows/s | KB/s | rows | filesz | memory | + | write read | write read | | (MB) | (MB) | +------------+-----------------+-----------------+-------+--------+--------+ + 16 B | 31000 166600 | 480 2600 | 3.e4 | 0.49| 6.5 | +------------+-----------------+-----------------+-------+--------+--------+ + 56 B | 17300 136000 | 942 7460 | 3.e4 | 1.7 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 56 B* | 1560 136000 | 85 7560 | 3.e4 | 1.7 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 64 B* | 1540 130000 | 96 8152 | 3.e4 | 1.9 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B* | 879 81100 | 472 43500 | 3.e4 | 19 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B** | 12000 103000 | 6440 55400 | 3.e5 | 168 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B** | 15500 81100 | 8350 43500 | 3.e4 | 19 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B**c| 909 1100 | 490 1081 | 3.e4 | 0.76| 8.5 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B***| 3600 81100 | 1950 43500 | 3.e4 | 19 | 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + +* These are figures obtained with a numarray as part of the record +** The numarray record fields are not set in each iteration +*** Some numarray elements of a record field are changed on each iteration +**c Like ** but with compression (level 1) + + +New version (10-March-2003) + + PyTables pre-0.4 + +Rec | rows/s | KB/s | rows | filesz | memory |%CP|%CP +length | write read | write read | | 
(MB) | (MB) |(w)|(r) +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B |434000 469000 | 6800 7300 | 3.e4 | 0.49| 6.5 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 Bc |326000 435000 | 5100 6800 | 3.e4 | 0.12| 6.5 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B |663000 728000 | 10400 11400 | 3.e5 | 4.7 | 7.0 | 99|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B |679000 797000 | 10600 12500 | 3.e6 | 46.0 | 10.0 | 98| 98 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 Bc |452000 663000 | 7100 10400 | 3.e6 | 9.3 | 10.0 | 98| 98 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B |576000 590000 | 9000 9200 | 3.e7 | 458.0 | 11.0 | 78| 76 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 3050 380000 | 163 20700 | 3.e4 | 1.7 | 7.2 | 98|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B* |194000 340000 | 10600 18600 | 3.e4 | 1.7 | 7.2 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B*c |142000 306000 | 7800 16600 | 3.e4 | 0.3 | 7.2 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B* |273600 589000 | 14800 32214 | 3.e5 | 16.0 | 9.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B*c |184000 425000 | 10070 23362 | 3.e5 | 2.7 | 9.7 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B* |203600 649000 | 11100 35500 | 3.e6 | 161.0 | 12.0 | 72| 99 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B* |184000 229000 | 10000 12500 | 1.e7 | 534.0 | 17.0 | 56| 40 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B*np|184000 229000 | 10000 12500 | 1.e7 | 534.0 | 17.0 | 56| 40 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B | 2230 143000 | 1195 76600 | 3.e4 | 19 | 9.4 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B* | 76000 250000 | 40900 134000 | 3.e4 | 19 | 9.4 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B*c | 13900 30000 | 7400 16100 | 3.e4 | 0.7 | 10.0 | 99| 99 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B* | 25400 325000 | 13600 174000 | 3.e5 | 167 | 11.0 | 71| 96 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B* | 18700 28000 | 10000 15100 | 6.e5 | 322 | 13.0 | 76| 9 +--------+-----------------+-----------------+-------+--------+--------+---+---- +550 B*c | 7300 21000 | 3900 11300 | 6.e5 | 11 | 17.0 | 98| 99 +--------+-----------------+-----------------+-------+--------+--------+---+---- + +* These are figures obtained with a numarray as part of the record +** The numarray record fields are not set in each iteration +c With compression (level 1) +np No psyco optimizations + + Shelve + +Rec length | rows/s | KB/s | rows | filesz | memory | + | write read | write read | | (MB) | (MB) | +------------+-----------------+-----------------+-------+--------+--------+ + 16 B | 68200 17000 | 1070 266 | 3.e4 | 0.94| 7.2 | +------------+-----------------+-----------------+-------+--------+--------+ + 56 B | 25000 14400 | 
1367 784 | 3.e4 | 2.5 | 10.6 | +------------+-----------------+-----------------+-------+--------+--------+ + 56 B* | 2980 2710 | 162 148 | 3.e4 | 7.3 | 33 | +------------+-----------------+-----------------+-------+--------+--------+ + 64 B* | 2900 2700 | 182 168 | 3.e4 | 7.5 | 33 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B* | 1090 1310 | 590 710 | 3.e4 | 58 | 122 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B** | 16000 14900 | 2400 1200 | 3.e4 | 2.4 | 10.6 | +------------+-----------------+-----------------+-------+--------+--------+ + 550 B***| 28000 11900 | 2400 1100 | 3.e4 | 2.5 | 10.6 | +------------+-----------------+-----------------+-------+--------+--------+ + +* These are figures obtained with a numarray as part of the record +** The nuamrray records are not set on each iteration +*** Some numarray elements of a record field are changed on each iteration + + + Python cPickle & bsddb3 RECNO with variable length + +Rec | Krows/s | MB/s | Krows | filesz | memory |%CP|%CP +length | write read | write read | | (MB) | (MB) |(w)|(r) +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 23.0 4.3 | 0.65 0.12 | 30 | 2.3 | 6.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 22.0 4.3 | 0.60 0.12 | 300 | 24 | 25.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 12.3 2.0 | 0.68 0.11 | 30 | 5.8 | 6.2 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 8.8 2.0 | 0.44 0.11 | 300 | 61 | 6.2 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + + + Python struct & bsddb3 RECNO with fixed length + +Rec | Krows/s | MB/s | Krows | filesz | memory |%CP|%CP +length | write read | write read | | (MB) | (MB) |(w)|(r) +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 61 71 | 1.6 1.9 | 30 | 1.0 | 5.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 56 65 | 1.5 1.8 | 300 | 10 | 5.8 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 51 61 | 1.4 1.6 | 3000 | 100 | 6.1 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 51 52 | 2.7 2.8 | 30 | 1.8 | 5.8 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 18 50 | 1.0 2.7 | 300 | 18 | 6.2 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 16 48 | 0.9 2.6 | 1000 | 61 | 6.5 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + + + PySqlLite + +Rec | rows/s | KB/s | rows | filesz | memory |%CP|%CP +length | write read | write read | | (MB) | (MB) |(w)|(r) +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 4290 1400000 | 200 48000 | 3.e4 | 1.4 | 5.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 3660 1030000 | 182 51000 | 3.e5 | 15 | 5.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 16 B | 3580 230000 | 192 12380 | 6.e6 | 322 | 5.0 |100| 25 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 2990 882000 | 250 76000 | 3.e4 | 2.6 | 5.0 |100|100 
+--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 2900 857000 | 270 80000 | 3.e5 | 28 | 5.0 |100|100 +--------+-----------------+-----------------+-------+--------+--------+---+---- + 56 B | 2900 120000 | 302 13100 | 3.e6 | 314 | 5.0 |100| 11 +--------+-----------------+-----------------+-------+--------+--------+---+---- + diff --git a/ci/github/get_hdf5_if_needed.sh b/ci/github/get_hdf5_if_needed.sh new file mode 100755 index 0000000..5a349f8 --- /dev/null +++ b/ci/github/get_hdf5_if_needed.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# vendored from https://github.com/h5py/h5py/blob/master/ci/get_hdf5_if_needed.sh + +set -e + +if [ -z ${HDF5_DIR+x} ]; then + echo "Using OS HDF5" +else + echo "Using downloaded HDF5" + if [ -z ${HDF5_MPI+x} ]; then + echo "Building serial" + EXTRA_MPI_FLAGS='' + else + echo "Building with MPI" + EXTRA_MPI_FLAGS="--enable-parallel --enable-shared" + fi + + if [[ "$OSTYPE" == "darwin"* ]]; then + lib_name=libhdf5.dylib + else + lib_name=libhdf5.so + fi + + if [ -f $HDF5_DIR/lib/$lib_name ]; then + echo "using cached build" + else + pushd /tmp + # Remove trailing .*, to get e.g. '1.12' ↓ + curl -fsSLO "https://www.hdfgroup.org/ftp/HDF5/releases/hdf5-${HDF5_VERSION%.*}/hdf5-$HDF5_VERSION/src/hdf5-$HDF5_VERSION.tar.gz" + tar -xzvf hdf5-$HDF5_VERSION.tar.gz + pushd hdf5-$HDF5_VERSION + chmod u+x autogen.sh + if [[ "${HDF5_VERSION%.*}" = "1.12" ]]; then + ./configure --prefix $HDF5_DIR $EXTRA_MPI_FLAGS --enable-build-mode=production + else + ./configure --prefix $HDF5_DIR $EXTRA_MPI_FLAGS + fi + make -j $(nproc) + make install + popd + popd + fi +fi diff --git a/contrib/README b/contrib/README new file mode 100644 index 0000000..b3f8bdb --- /dev/null +++ b/contrib/README @@ -0,0 +1,10 @@ +In these directories you can find some scripts contributed by PyTables +users. If you have any suggestion on them, please contact the original +authors. + +nctoh5.py: Converts netcdf files to hdf5. 
You can find an improved nctoh5 +utility included in utils/ directory +Author: Jeff Whitaker + +make_hdf.py: Converts general python data structures into hdf5 +Author: John Nielsen diff --git a/contrib/make_hdf.py b/contrib/make_hdf.py new file mode 100644 index 0000000..150c810 --- /dev/null +++ b/contrib/make_hdf.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python + +import pickle +from time import perf_counter as clock + +import tables as tb + + +def is_scalar(item): + try: + iter(item) + # could be a string + try: + item[:0] + '' # check for string + return 'str' + except: + return 0 + except: + return 'notstr' + + +def is_dict(item): + try: + item.items() + return 1 + except: + return 0 + + +def make_col(row_type, row_name, row_item, str_len): + '''for strings it will always make at least 80 char or twice mac char size''' + set_len = 80 + if str_len: + if 2 * str_len > set_len: + set_len = 2 * str_len + row_type[row_name] = tb.StringCol(set_len) + else: + type_matrix = { + int: tb.Int32Col(), + float: tb.Float32Col(), + } + row_type[row_name] = type_matrix[type(row_item)] + + +def make_row(data): + row_type = {} + scalar_type = is_scalar(data) + if scalar_type: + if scalar_type == 'str': + make_col(row_type, 'scalar', data, len(data)) + else: + make_col(row_type, 'scalar', data, 0) + else: # it is a list-like + the_type = is_scalar(data[0]) + if the_type == 'str': + # get max length + the_max = 0 + for i in data: + if len(i) > the_max: + the_max = len(i) + make_col(row_type, 'col', data[0], the_max) + elif the_type: + make_col(row_type, 'col', data[0], 0) + else: # list within the list, make many columns + make_col(row_type, 'col_depth', 0, 0) + count = 0 + for col in data: + the_type = is_scalar(col[0]) + if the_type == 'str': + # get max length + the_max = 0 + for i in data: + if len(i) > the_max: + the_max = len(i) + make_col(row_type, 'col_' + str(count), col[0], the_max) + elif the_type: + make_col(row_type, 'col_' + str(count), col[0], 0) + else: + raise ValueError('too many nested levels of lists') + count += 1 + return row_type + + +def add_table(fileh, group_obj, data, table_name): + # figure out if it is a list of lists or a single list + # get types of columns + row_type = make_row(data) + table1 = fileh.create_table(group_obj, table_name, row_type, 'H') + row = table1.row + + if is_scalar(data): + row['scalar'] = data + row.append() + else: + if is_scalar(data[0]): + for i in data: + row['col'] = i + row.append() + else: + count = 0 + for col in data: + row['col_depth'] = len(col) + for the_row in col: + if is_scalar(the_row): + row['col_' + str(count)] = the_row + row.append() + else: + raise ValueError('too many levels of lists') + count += 1 + table1.flush() + + +def add_cache(fileh, cache): + group_name = 'pytables_cache_v0'; + table_name = 'cache0' + root = fileh.root + group_obj = fileh.create_group(root, group_name) + cache_str = pickle.dumps(cache, 0).decode() + cache_str = cache_str.replace('\n', chr(1)) + cache_pieces = [] + while cache_str: + cache_part = cache_str[:8000]; + cache_str = cache_str[8000:] + if cache_part: + cache_pieces.append(cache_part) + row_type = {} + row_type['col_0'] = tb.StringCol(8000) + # + table_cache = fileh.create_table(group_obj, table_name, row_type, 'H') + for piece in cache_pieces: + print(len(piece)) + table_cache.row['col_0'] = piece + table_cache.row.append() + table_cache.flush() + + +def save2(hdf_file, data): + fileh = tb.open_file(hdf_file, mode='w', title='logon history') + root = fileh.root; + cache_root = cache = {} + root_path 
= root._v_pathname; + root = 0 + stack = [(root_path, data, cache)] + table_num = 0 + count = 0 + + while stack: + (group_obj_path, data, cache) = stack.pop() + for grp_name in data: + count += 1 + cache[grp_name] = {} + new_group_obj = fileh.create_group(group_obj_path, grp_name) + new_path = new_group_obj._v_pathname + # if dict, you have a bunch of groups + if is_dict(data[grp_name]): # {'mother':[22,23,24]} + stack.append((new_path, data[grp_name], cache[grp_name])) + # you have a table + else: + # data[grp_name]=[110,130,140],[1,2,3] + add_table(fileh, new_path, data[grp_name], f'tbl_{table_num}') + table_num += 1 + + add_cache(fileh, cache_root) + fileh.close() + + +class Hdf_dict(dict): + def __init__(self, hdf_file, hdf_dict=None, stack=None): + if hdf_dict is None: + hdf_dict = {} + if stack is None: + stack = [] + self.hdf_file = hdf_file + self.stack = stack + if stack: + self.hdf_dict = hdf_dict + else: + self.hdf_dict = self.get_cache() + self.cur_dict = self.hdf_dict + + def get_cache(self): + fileh = tb.open_file(self.hdf_file, root_uep='/pytables_cache_v0') + table = fileh.root.cache0 + total = [] + print('reading') + begin = clock() + for i in table.iterrows(): + total.append(i['col_0'].decode()) + total = ''.join(total) + total = total.replace(chr(1), '\n') + print('loaded cache len=', len(total), clock() - begin) + begin = clock() + a = pickle.loads(total.encode()) + print('cache', clock() - begin) + return a + + def has_key(self, k): + return k in self.cur_dict + + def keys(self): + return self.cur_dict.keys() + + def get(self, key, default=None): + try: + return self.__getitem__(key) + except: + return default + + def items(self): + return list(self.cur_dict.items()) + + def values(self): + return list(self.cur_dict.values()) + + def __len__(self): + return len(self.cur_dict) + + def __getitem__(self, k): + if k in self.cur_dict: + # now check if k has any data + if self.cur_dict[k]: + new_stack = self.stack[:] + new_stack.append(k) + return Hdf_dict(self.hdf_file, hdf_dict=self.cur_dict[k], + stack=new_stack) + else: + new_stack = self.stack[:] + new_stack.append(k) + fileh = tb.open_file(self.hdf_file, + root_uep='/'.join(new_stack)) + for table in fileh.root: + try: + for item in table['scalar']: + return item + except: + # otherwise they stored a list of data + try: + return [item for item in table['col']] + except: + cur_column = [] + total_columns = [] + col_num = 0 + cur_row = 0 + num_rows = 0 + for row in table: + if not num_rows: + num_rows = row['col_depth'] + if cur_row == num_rows: + cur_row = num_rows = 0 + col_num += 1 + total_columns.append(cur_column) + cur_column = [] + cur_column.append(row['col_' + str(col_num)]) + cur_row += 1 + total_columns.append(cur_column) + return total_columns + else: + raise KeyError(k) + + def iterkeys(self): + yield from self.keys() + + def __iter__(self): + return self.iterkeys() + + def itervalues(self): + for k in self.iterkeys(): + v = self.__getitem__(k) + yield v + + def iteritems(self): + # yield children + for k in self.iterkeys(): + v = self.__getitem__(k) + yield (k, v) + + def __repr__(self): + return '{Hdf dict}' + + def __str__(self): + return self.__repr__() + + ##### + def setdefault(self, key, default=None): + try: + return self.__getitem__(key) + except: + self.__setitem__(key) + return default + + def update(self, d): + for k, v in d.items(): + self.__setitem__(k, v) + + def popitem(self): + try: + k, v = next(self.items()) + del self[k] + return k, v + except StopIteration: + raise KeyError("Hdf Dict is 
empty") + + def __setitem__(self, key, value): + raise NotImplementedError + + def __delitem__(self, key): + raise NotImplementedError + + def __hash__(self): + raise TypeError("Hdf dict bjects are unhashable") + + +if __name__ == '__main__': + + def write_small(file=''): + data1 = { + 'fred': ['a', 'b', 'c'], + 'barney': [[9110, 9130, 9140], [91, 92, 93]], + 'wilma': { + 'mother': {'pebbles': [22, 23, 24], 'bambam': [67, 68, 69]}} + } + + print('saving') + save2(file, data1) + print('saved') + + + def read_small(file=''): + a = Hdf_dict(file) + print(a['wilma']) + b = a['wilma'] + for i in b: + print(i) + + print(a.keys()) + print('has fred', bool('fred' in a)) + print('length a', len(a)) + print('get', a.get('fred'), a.get('not here')) + print('wilma keys', a['wilma'].keys()) + print('barney', a['barney']) + print('get items') + print(a.items()) + for i in a.items(): + print('item', i) + for i in a.values(): + print(i) + + + a = input('enter y to write out test file to test.hdf ') + if a.strip() == 'y': + print('writing') + write_small('test.hdf') + print('reading') + read_small('test.hdf') diff --git a/contrib/nctoh5.py b/contrib/nctoh5.py new file mode 100755 index 0000000..a88c37d --- /dev/null +++ b/contrib/nctoh5.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +""" +convert netCDF file to HDF5 using Scientific.IO.NetCDF and PyTables. +Jeff Whitaker + +This requires Scientific from +http://starship.python.net/~hinsen/ScientificPython + +""" +import sys +from Scientific.IO import NetCDF +import tables as tb +# open netCDF file +ncfile = NetCDF.NetCDFFile(sys.argv[1], mode = "r") +# open h5 file. +h5file = tb.openFile(sys.argv[2], mode = "w") +# loop over variables in netCDF file. +for varname in ncfile.variables.keys(): + var = ncfile.variables[varname] + vardims = list(var.dimensions) + vardimsizes = [ncfile.dimensions[vardim] for vardim in vardims] + # use long_name for title. + if hasattr(var, 'long_name'): + title = var.long_name + else: # or, just use some bogus title. + title = varname + ' array' + # if variable has unlimited dimension or has rank>1, + # make it enlargeable (with zlib compression). + if vardimsizes[0] == None or len(vardimsizes) > 1: + vardimsizes[0] = 0 + vardata = h5file.createEArray(h5file.root, varname, + tb.Atom(shape=tuple(vardimsizes), dtype=var.typecode(),), + title, filters=tb.Filters(complevel=6, complib='zlib')) + # write data to enlargeable array on record at a time. + # (so the whole array doesn't have to be kept in memory). + for n in range(var.shape[0]): + vardata.append(var[n:n+1]) + # or else, create regular array write data to it all at once. + else: + vardata=h5file.createArray(h5file.root, varname, var[:], title) + # set variable attributes. + for key, val in var.__dict__.iteritems(): + setattr(vardata.attrs, key, val) + setattr(vardata.attrs, 'dimensions', tuple(vardims)) +# set global (file) attributes. 
+for key, val in ncfile.__dict__.iteritems(): + setattr(h5file.root._v_attrs, key, val) +# Close the file +h5file.close() + diff --git a/cpuinfo.py b/cpuinfo.py new file mode 100644 index 0000000..5f9f250 --- /dev/null +++ b/cpuinfo.py @@ -0,0 +1,2206 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +# Copyright (c) 2014-2018, Matthew Brennan Jones +# Py-cpuinfo gets CPU info with pure Python 2 & 3 +# It uses the MIT License +# It is hosted at: https://github.com/workhorsy/py-cpuinfo +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +CPUINFO_VERSION = (4, 0, 0) + +import os, sys +import glob +import re +import time +import platform +import multiprocessing +import ctypes +import pickle +import base64 +import subprocess + +try: + import _winreg as winreg +except ImportError as err: + try: + import winreg + except ImportError as err: + pass + +# Load hacks for Windows +if platform.system().lower() == 'windows': + # Monkey patch multiprocessing's Popen to fork properly on Windows Pyinstaller + # https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing + try: + import multiprocessing.popen_spawn_win32 as forking + except ImportError as err: + try: + import multiprocessing.popen_fork as forking + except ImportError as err: + import multiprocessing.forking as forking + + class _Popen(forking.Popen): + def __init__(self, *args, **kw): + if hasattr(sys, 'frozen'): + # We have to set original _MEIPASS2 value from sys._MEIPASS + # to get --onefile mode working. + os.putenv('_MEIPASS2', sys._MEIPASS) + try: + super().__init__(*args, **kw) + finally: + if hasattr(sys, 'frozen'): + # On some platforms (e.g. AIX) 'os.unsetenv()' is not + # available. In those cases we cannot delete the variable + # but only set it to the empty string. The bootloader + # can handle this case. 
+ if hasattr(os, 'unsetenv'): + os.unsetenv('_MEIPASS2') + else: + os.putenv('_MEIPASS2', '') + + forking.Popen = _Popen + +class DataSource(object): + bits = platform.architecture()[0] + cpu_count = multiprocessing.cpu_count() + is_windows = platform.system().lower() == 'windows' + raw_arch_string = platform.machine() + can_cpuid = True + + @staticmethod + def has_proc_cpuinfo(): + return os.path.exists('/proc/cpuinfo') + + @staticmethod + def has_dmesg(): + return len(program_paths('dmesg')) > 0 + + @staticmethod + def has_var_run_dmesg_boot(): + uname = platform.system().strip().strip('"').strip("'").strip().lower() + return 'linux' in uname and os.path.exists('/var/run/dmesg.boot') + + @staticmethod + def has_cpufreq_info(): + return len(program_paths('cpufreq-info')) > 0 + + @staticmethod + def has_sestatus(): + return len(program_paths('sestatus')) > 0 + + @staticmethod + def has_sysctl(): + return len(program_paths('sysctl')) > 0 + + @staticmethod + def has_isainfo(): + return len(program_paths('isainfo')) > 0 + + @staticmethod + def has_kstat(): + return len(program_paths('kstat')) > 0 + + @staticmethod + def has_sysinfo(): + return len(program_paths('sysinfo')) > 0 + + @staticmethod + def has_lscpu(): + return len(program_paths('lscpu')) > 0 + + @staticmethod + def has_ibm_pa_features(): + return len(program_paths('lsprop')) > 0 + + @staticmethod + def has_wmic(): + returncode, output = run_and_get_stdout(['wmic', 'os', 'get', 'Version']) + return returncode == 0 and len(output) > 0 + + @staticmethod + def cat_proc_cpuinfo(): + return run_and_get_stdout(['cat', '/proc/cpuinfo']) + + @staticmethod + def cpufreq_info(): + return run_and_get_stdout(['cpufreq-info']) + + @staticmethod + def sestatus_allow_execheap(): + return run_and_get_stdout(['sestatus', '-b'], ['grep', '-i', '"allow_execheap"'])[1].strip().lower().endswith('on') + + @staticmethod + def sestatus_allow_execmem(): + return run_and_get_stdout(['sestatus', '-b'], ['grep', '-i', '"allow_execmem"'])[1].strip().lower().endswith('on') + + @staticmethod + def dmesg_a(): + return run_and_get_stdout(['dmesg', '-a']) + + @staticmethod + def cat_var_run_dmesg_boot(): + return run_and_get_stdout(['cat', '/var/run/dmesg.boot']) + + @staticmethod + def sysctl_machdep_cpu_hw_cpufrequency(): + return run_and_get_stdout(['sysctl', 'machdep.cpu', 'hw.cpufrequency']) + + @staticmethod + def isainfo_vb(): + return run_and_get_stdout(['isainfo', '-vb']) + + @staticmethod + def kstat_m_cpu_info(): + return run_and_get_stdout(['kstat', '-m', 'cpu_info']) + + @staticmethod + def sysinfo_cpu(): + return run_and_get_stdout(['sysinfo', '-cpu']) + + @staticmethod + def lscpu(): + return run_and_get_stdout(['lscpu']) + + @staticmethod + def ibm_pa_features(): + ibm_features = glob.glob('/proc/device-tree/cpus/*/ibm,pa-features') + if ibm_features: + return run_and_get_stdout(['lsprop', ibm_features[0]]) + + @staticmethod + def wmic_cpu(): + return run_and_get_stdout(['wmic', 'cpu', 'get', 'Name,CurrentClockSpeed,L2CacheSize,L3CacheSize,Description,Caption,Manufacturer', '/format:list']) + + @staticmethod + def winreg_processor_brand(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + processor_brand = winreg.QueryValueEx(key, "ProcessorNameString")[0] + winreg.CloseKey(key) + return processor_brand + + @staticmethod + def winreg_vendor_id(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + vendor_id = winreg.QueryValueEx(key, 
"VendorIdentifier")[0] + winreg.CloseKey(key) + return vendor_id + + @staticmethod + def winreg_raw_arch_string(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SYSTEM\CurrentControlSet\Control\Session Manager\Environment") + raw_arch_string = winreg.QueryValueEx(key, "PROCESSOR_ARCHITECTURE")[0] + winreg.CloseKey(key) + return raw_arch_string + + @staticmethod + def winreg_hz_actual(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + hz_actual = winreg.QueryValueEx(key, "~Mhz")[0] + winreg.CloseKey(key) + hz_actual = to_hz_string(hz_actual) + return hz_actual + + @staticmethod + def winreg_feature_bits(): + key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"Hardware\Description\System\CentralProcessor\0") + feature_bits = winreg.QueryValueEx(key, "FeatureSet")[0] + winreg.CloseKey(key) + return feature_bits + +def obj_to_b64(thing): + a = thing + b = pickle.dumps(a) + c = base64.b64encode(b) + d = c.decode('utf8') + return d + +def b64_to_obj(thing): + try: + a = base64.b64decode(thing) + b = pickle.loads(a) + return b + except: + return {} + +def run_and_get_stdout(command, pipe_command=None): + if not pipe_command: + p1 = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + output = p1.communicate()[0] + output = output.decode(encoding='UTF-8') + return p1.returncode, output + else: + p1 = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + p2 = subprocess.Popen(pipe_command, stdin=p1.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p1.stdout.close() + output = p2.communicate()[0] + output = output.decode(encoding='UTF-8') + return p2.returncode, output + + +def program_paths(program_name): + paths = [] + exts = filter(None, os.environ.get('PATHEXT', '').split(os.pathsep)) + path = os.environ['PATH'] + for p in os.environ['PATH'].split(os.pathsep): + p = os.path.join(p, program_name) + if os.access(p, os.X_OK): + paths.append(p) + for e in exts: + pext = p + e + if os.access(pext, os.X_OK): + paths.append(pext) + return paths + +def _get_field_actual(cant_be_number, raw_string, field_names): + for line in raw_string.splitlines(): + for field_name in field_names: + field_name = field_name.lower() + if ':' in line: + left, right = line.split(':', 1) + left = left.strip().lower() + right = right.strip() + if left == field_name and len(right) > 0: + if cant_be_number: + if not right.isdigit(): + return right + else: + return right + + return None + +def _get_field(cant_be_number, raw_string, convert_to, default_value, *field_names): + retval = _get_field_actual(cant_be_number, raw_string, field_names) + + # Convert the return value + if retval and convert_to: + try: + retval = convert_to(retval) + except: + retval = default_value + + # Return the default if there is no return value + if retval is None: + retval = default_value + + return retval + +def _get_hz_string_from_brand(processor_brand): + # Just return 0 if the processor brand does not have the Hz + if not 'hz' in processor_brand.lower(): + return (1, '0.0') + + hz_brand = processor_brand.lower() + scale = 1 + + if hz_brand.endswith('mhz'): + scale = 6 + elif hz_brand.endswith('ghz'): + scale = 9 + if '@' in hz_brand: + hz_brand = hz_brand.split('@')[1] + else: + hz_brand = hz_brand.rsplit(None, 1)[1] + + hz_brand = hz_brand.rstrip('mhz').rstrip('ghz').strip() + hz_brand = to_hz_string(hz_brand) + + return (scale, hz_brand) + +def to_friendly_hz(ticks, scale): + # Get the 
raw Hz as a string + left, right = to_raw_hz(ticks, scale) + ticks = '{0}.{1}'.format(left, right) + + # Get the location of the dot, and remove said dot + dot_index = ticks.index('.') + ticks = ticks.replace('.', '') + + # Get the Hz symbol and scale + symbol = "Hz" + scale = 0 + if dot_index > 9: + symbol = "GHz" + scale = 9 + elif dot_index > 6: + symbol = "MHz" + scale = 6 + elif dot_index > 3: + symbol = "KHz" + scale = 3 + + # Get the Hz with the dot at the new scaled point + ticks = '{0}.{1}'.format(ticks[:-scale-1], ticks[-scale-1:]) + + # Format the ticks to have 4 numbers after the decimal + # and remove any superfluous zeroes. + ticks = '{0:.4f} {1}'.format(float(ticks), symbol) + ticks = ticks.rstrip('0') + + return ticks + +def to_raw_hz(ticks, scale): + # Scale the numbers + ticks = ticks.lstrip('0') + old_index = ticks.index('.') + ticks = ticks.replace('.', '') + ticks = ticks.ljust(scale + old_index+1, '0') + new_index = old_index + scale + ticks = '{0}.{1}'.format(ticks[:new_index], ticks[new_index:]) + left, right = ticks.split('.') + left, right = int(left), int(right) + return (left, right) + +def to_hz_string(ticks): + # Convert to string + ticks = '{0}'.format(ticks) + + # Add decimal if missing + if '.' not in ticks: + ticks = '{0}.0'.format(ticks) + + # Remove trailing zeros + ticks = ticks.rstrip('0') + + # Add one trailing zero for empty right side + if ticks.endswith('.'): + ticks = '{0}0'.format(ticks) + + return ticks + +def to_friendly_bytes(input): + if not input: + return input + input = "{0}".format(input) + + formats = { + r"^[0-9]+B$" : 'B', + r"^[0-9]+K$" : 'KB', + r"^[0-9]+M$" : 'MB', + r"^[0-9]+G$" : 'GB' + } + + for pattern, friendly_size in formats.items(): + if re.match(pattern, input): + return "{0} {1}".format(input[ : -1].strip(), friendly_size) + + return input + +def _parse_cpu_string(cpu_string): + # Get location of fields at end of string + fields_index = cpu_string.find('(', cpu_string.find('@')) + #print(fields_index) + + # Processor Brand + processor_brand = cpu_string + if fields_index != -1: + processor_brand = cpu_string[0 : fields_index].strip() + #print('processor_brand: ', processor_brand) + + fields = None + if fields_index != -1: + fields = cpu_string[fields_index : ] + #print('fields: ', fields) + + # Hz + scale, hz_brand = _get_hz_string_from_brand(processor_brand) + + # Various fields + vendor_id, stepping, model, family = (None, None, None, None) + if fields: + try: + fields = fields.rsplit('(', 1)[1].split(')')[0].split(',') + fields = [f.strip().lower() for f in fields] + fields = [f.split(':') for f in fields] + fields = [{f[0].strip() : f[1].strip()} for f in fields] + #print('fields: ', fields) + for field in fields: + name = list(field.keys())[0] + value = list(field.values())[0] + #print('name:{0}, value:{1}'.format(name, value)) + if name == 'origin': + vendor_id = value.strip('"') + elif name == 'stepping': + stepping = int(value.lstrip('0x'), 16) + elif name == 'model': + model = int(value.lstrip('0x'), 16) + elif name in ['fam', 'family']: + family = int(value.lstrip('0x'), 16) + except: + #raise + pass + + return (processor_brand, hz_brand, scale, vendor_id, stepping, model, family) + +def _parse_dmesg_output(output): + try: + # Get all the dmesg lines that might contain a CPU string + lines = output.split(' CPU0:')[1:] + \ + output.split(' CPU1:')[1:] + \ + output.split(' CPU:')[1:] + \ + output.split('\nCPU0:')[1:] + \ + output.split('\nCPU1:')[1:] + \ + output.split('\nCPU:')[1:] + lines = 
[l.split('\n')[0].strip() for l in lines] + + # Convert the lines to CPU strings + cpu_strings = [_parse_cpu_string(l) for l in lines] + + # Find the CPU string that has the most fields + best_string = None + highest_count = 0 + for cpu_string in cpu_strings: + count = sum([n is not None for n in cpu_string]) + if count > highest_count: + highest_count = count + best_string = cpu_string + + # If no CPU string was found, return {} + if not best_string: + return {} + + processor_brand, hz_actual, scale, vendor_id, stepping, model, family = best_string + + # Origin + if ' Origin=' in output: + fields = output[output.find(' Origin=') : ].split('\n')[0] + fields = fields.strip().split() + fields = [n.strip().split('=') for n in fields] + fields = [{n[0].strip().lower() : n[1].strip()} for n in fields] + #print('fields: ', fields) + for field in fields: + name = list(field.keys())[0] + value = list(field.values())[0] + #print('name:{0}, value:{1}'.format(name, value)) + if name == 'origin': + vendor_id = value.strip('"') + elif name == 'stepping': + stepping = int(value.lstrip('0x'), 16) + elif name == 'model': + model = int(value.lstrip('0x'), 16) + elif name in ['fam', 'family']: + family = int(value.lstrip('0x'), 16) + #print('FIELDS: ', (vendor_id, stepping, model, family)) + + # Features + flag_lines = [] + for category in [' Features=', ' Features2=', ' AMD Features=', ' AMD Features2=']: + if category in output: + flag_lines.append(output.split(category)[1].split('\n')[0]) + + flags = [] + for line in flag_lines: + line = line.split('<')[1].split('>')[0].lower() + for flag in line.split(','): + flags.append(flag) + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'flags' : flags + } + + if hz_advertised and hz_advertised != '0.0': + info['hz_advertised'] = to_friendly_hz(hz_advertised, scale) + info['hz_actual'] = to_friendly_hz(hz_actual, scale) + + if hz_advertised and hz_advertised != '0.0': + info['hz_advertised_raw'] = to_raw_hz(hz_advertised, scale) + info['hz_actual_raw'] = to_raw_hz(hz_actual, scale) + + return {k: v for k, v in info.items() if v} + except: + #raise + pass + + return {} + +def parse_arch(raw_arch_string): + arch, bits = None, None + raw_arch_string = raw_arch_string.lower() + + # X86 + if re.match('^i\d86$|^x86$|^x86_32$|^i86pc$|^ia32$|^ia-32$|^bepc$', raw_arch_string): + arch = 'X86_32' + bits = 32 + elif re.match('^x64$|^x86_64$|^x86_64t$|^i686-64$|^amd64$|^ia64$|^ia-64$', raw_arch_string): + arch = 'X86_64' + bits = 64 + # ARM + elif re.match('^armv8-a|aarch64$', raw_arch_string): + arch = 'ARM_8' + bits = 64 + elif re.match('^armv7$|^armv7[a-z]$|^armv7-[a-z]$|^armv6[a-z]$', raw_arch_string): + arch = 'ARM_7' + bits = 32 + elif re.match('^armv8$|^armv8[a-z]$|^armv8-[a-z]$', raw_arch_string): + arch = 'ARM_8' + bits = 32 + # PPC + elif re.match('^ppc32$|^prep$|^pmac$|^powermac$', raw_arch_string): + arch = 'PPC_32' + bits = 32 + elif re.match('^powerpc$|^ppc64$|^ppc64le$', raw_arch_string): + arch = 'PPC_64' + bits = 64 + # SPARC + elif re.match('^sparc32$|^sparc$', raw_arch_string): + arch = 'SPARC_32' + bits = 32 + elif re.match('^sparc64$|^sun4u$|^sun4v$', raw_arch_string): + arch = 'SPARC_64' + bits = 64 + + return (arch, bits) + +def is_bit_set(reg, bit): + mask = 1 << bit + is_set = reg & mask > 0 + return is_set + + +class CPUID(object): + def 
__init__(self): + self.prochandle = None + + # Figure out if SE Linux is on and in enforcing mode + self.is_selinux_enforcing = False + + # Just return if the SE Linux Status Tool is not installed + if not DataSource.has_sestatus(): + return + + # Figure out if we can execute heap and execute memory + can_selinux_exec_heap = DataSource.sestatus_allow_execheap() + can_selinux_exec_memory = DataSource.sestatus_allow_execmem() + self.is_selinux_enforcing = (not can_selinux_exec_heap or not can_selinux_exec_memory) + + def _asm_func(self, restype=None, argtypes=(), byte_code=[]): + byte_code = bytes.join(b'', byte_code) + address = None + + if DataSource.is_windows: + # Allocate a memory segment the size of the byte code, and make it executable + size = len(byte_code) + # Alloc at least 1 page to ensure we own all pages that we want to change protection on + if size < 0x1000: size = 0x1000 + MEM_COMMIT = ctypes.c_ulong(0x1000) + PAGE_READWRITE = ctypes.c_ulong(0x4) + pfnVirtualAlloc = ctypes.windll.kernel32.VirtualAlloc + pfnVirtualAlloc.restype = ctypes.c_void_p + address = pfnVirtualAlloc(None, ctypes.c_size_t(size), MEM_COMMIT, PAGE_READWRITE) + if not address: + raise Exception("Failed to VirtualAlloc") + + # Copy the byte code into the memory segment + memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t)(ctypes._memmove_addr) + if memmove(address, byte_code, size) < 0: + raise Exception("Failed to memmove") + + # Enable execute permissions + PAGE_EXECUTE = ctypes.c_ulong(0x10) + old_protect = ctypes.c_ulong(0) + pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect + res = pfnVirtualProtect(ctypes.c_void_p(address), ctypes.c_size_t(size), PAGE_EXECUTE, ctypes.byref(old_protect)) + if not res: + raise Exception("Failed VirtualProtect") + + # Flush Instruction Cache + # First, get process Handle + if not self.prochandle: + pfnGetCurrentProcess = ctypes.windll.kernel32.GetCurrentProcess + pfnGetCurrentProcess.restype = ctypes.c_void_p + self.prochandle = ctypes.c_void_p(pfnGetCurrentProcess()) + # Actually flush cache + res = ctypes.windll.kernel32.FlushInstructionCache(self.prochandle, ctypes.c_void_p(address), ctypes.c_size_t(size)) + if not res: + raise Exception("Failed FlushInstructionCache") + else: + # Allocate a memory segment the size of the byte code + size = len(byte_code) + pfnvalloc = ctypes.pythonapi.valloc + pfnvalloc.restype = ctypes.c_void_p + address = pfnvalloc(ctypes.c_size_t(size)) + if not address: + raise Exception("Failed to valloc") + + # Mark the memory segment as writeable only + if not self.is_selinux_enforcing: + WRITE = 0x2 + if ctypes.pythonapi.mprotect(ctypes.c_void_p(address), size, WRITE) < 0: + raise Exception("Failed to mprotect") + + # Copy the byte code into the memory segment + if ctypes.pythonapi.memmove(ctypes.c_void_p(address), byte_code, ctypes.c_size_t(size)) < 0: + raise Exception("Failed to memmove") + + # Mark the memory segment as writeable and executable only + if not self.is_selinux_enforcing: + WRITE_EXECUTE = 0x2 | 0x4 + if ctypes.pythonapi.mprotect(ctypes.c_void_p(address), size, WRITE_EXECUTE) < 0: + raise Exception("Failed to mprotect") + + # Cast the memory segment into a function + functype = ctypes.CFUNCTYPE(restype, *argtypes) + fun = functype(address) + return fun, address + + def _run_asm(self, *byte_code): + # Convert the byte code into a function that returns an int + restype = ctypes.c_uint32 + argtypes = () + func, address = self._asm_func(restype, argtypes, byte_code) + + # Call 
the byte code like a function + retval = func() + + byte_code = bytes.join(b'', byte_code) + size = ctypes.c_size_t(len(byte_code)) + + # Free the function memory segment + if DataSource.is_windows: + MEM_RELEASE = ctypes.c_ulong(0x8000) + ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address), ctypes.c_size_t(0), MEM_RELEASE) + else: + # Remove the executable tag on the memory + READ_WRITE = 0x1 | 0x2 + if ctypes.pythonapi.mprotect(ctypes.c_void_p(address), size, READ_WRITE) < 0: + raise Exception("Failed to mprotect") + + ctypes.pythonapi.free(ctypes.c_void_p(address)) + + return retval + + # FIXME: We should not have to use different instructions to + # set eax to 0 or 1, on 32bit and 64bit machines. + def _zero_eax(self): + return ( + b"\x31\xC0" # xor eax,eax + ) + + def _zero_ecx(self): + return ( + b"\x31\xC9" # xor ecx,ecx + ) + def _one_eax(self): + return ( + b"\xB8\x01\x00\x00\x00" # mov eax,0x1" + ) + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID + def get_vendor_id(self): + # EBX + ebx = self._run_asm( + self._zero_eax(), + b"\x0F\xA2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + self._zero_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # EDX + edx = self._run_asm( + self._zero_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # Each 4bits is a ascii letter in the name + vendor_id = [] + for reg in [ebx, edx, ecx]: + for n in [0, 8, 16, 24]: + vendor_id.append(chr((reg >> n) & 0xFF)) + vendor_id = ''.join(vendor_id) + + return vendor_id + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits + def get_info(self): + # EAX + eax = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\xC3" # ret + ) + + # Get the CPU info + stepping = (eax >> 0) & 0xF # 4 bits + model = (eax >> 4) & 0xF # 4 bits + family = (eax >> 8) & 0xF # 4 bits + processor_type = (eax >> 12) & 0x3 # 2 bits + extended_model = (eax >> 16) & 0xF # 4 bits + extended_family = (eax >> 20) & 0xFF # 8 bits + + return { + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'processor_type' : processor_type, + 'extended_model' : extended_model, + 'extended_family' : extended_family + } + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D80000000h:_Get_Highest_Extended_Function_Supported + def get_max_extension_support(self): + # Check for extension support + max_extension_support = self._run_asm( + b"\xB8\x00\x00\x00\x80" # mov ax,0x80000000 + b"\x0f\xa2" # cpuid + b"\xC3" # ret + ) + + return max_extension_support + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D1:_Processor_Info_and_Feature_Bits + def get_flags(self, max_extension_support): + # EDX + edx = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + self._one_eax(), + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # Get the CPU flags + flags = { + 'fpu' : is_bit_set(edx, 0), + 'vme' : is_bit_set(edx, 1), + 'de' : is_bit_set(edx, 2), + 'pse' : is_bit_set(edx, 3), + 'tsc' : is_bit_set(edx, 4), + 'msr' : is_bit_set(edx, 5), + 'pae' : is_bit_set(edx, 6), + 'mce' : is_bit_set(edx, 7), + 'cx8' : is_bit_set(edx, 8), + 'apic' : is_bit_set(edx, 9), + #'reserved1' : is_bit_set(edx, 10), + 'sep' : is_bit_set(edx, 11), + 'mtrr' : is_bit_set(edx, 12), + 'pge' : is_bit_set(edx, 13), + 'mca' : is_bit_set(edx, 14), + 'cmov' : is_bit_set(edx, 15), + 'pat' : is_bit_set(edx, 16), + 'pse36' : is_bit_set(edx, 
17), + 'pn' : is_bit_set(edx, 18), + 'clflush' : is_bit_set(edx, 19), + #'reserved2' : is_bit_set(edx, 20), + 'dts' : is_bit_set(edx, 21), + 'acpi' : is_bit_set(edx, 22), + 'mmx' : is_bit_set(edx, 23), + 'fxsr' : is_bit_set(edx, 24), + 'sse' : is_bit_set(edx, 25), + 'sse2' : is_bit_set(edx, 26), + 'ss' : is_bit_set(edx, 27), + 'ht' : is_bit_set(edx, 28), + 'tm' : is_bit_set(edx, 29), + 'ia64' : is_bit_set(edx, 30), + 'pbe' : is_bit_set(edx, 31), + + 'pni' : is_bit_set(ecx, 0), + 'pclmulqdq' : is_bit_set(ecx, 1), + 'dtes64' : is_bit_set(ecx, 2), + 'monitor' : is_bit_set(ecx, 3), + 'ds_cpl' : is_bit_set(ecx, 4), + 'vmx' : is_bit_set(ecx, 5), + 'smx' : is_bit_set(ecx, 6), + 'est' : is_bit_set(ecx, 7), + 'tm2' : is_bit_set(ecx, 8), + 'ssse3' : is_bit_set(ecx, 9), + 'cid' : is_bit_set(ecx, 10), + #'reserved3' : is_bit_set(ecx, 11), + 'fma' : is_bit_set(ecx, 12), + 'cx16' : is_bit_set(ecx, 13), + 'xtpr' : is_bit_set(ecx, 14), + 'pdcm' : is_bit_set(ecx, 15), + #'reserved4' : is_bit_set(ecx, 16), + 'pcid' : is_bit_set(ecx, 17), + 'dca' : is_bit_set(ecx, 18), + 'sse4_1' : is_bit_set(ecx, 19), + 'sse4_2' : is_bit_set(ecx, 20), + 'x2apic' : is_bit_set(ecx, 21), + 'movbe' : is_bit_set(ecx, 22), + 'popcnt' : is_bit_set(ecx, 23), + 'tscdeadline' : is_bit_set(ecx, 24), + 'aes' : is_bit_set(ecx, 25), + 'xsave' : is_bit_set(ecx, 26), + 'osxsave' : is_bit_set(ecx, 27), + 'avx' : is_bit_set(ecx, 28), + 'f16c' : is_bit_set(ecx, 29), + 'rdrnd' : is_bit_set(ecx, 30), + 'hypervisor' : is_bit_set(ecx, 31) + } + + # Get a list of only the flags that are true + flags = [k for k, v in flags.items() if v] + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D7.2C_ECX.3D0:_Extended_Features + if max_extension_support >= 7: + # EBX + ebx = self._run_asm( + self._zero_ecx(), + b"\xB8\x07\x00\x00\x00" # mov eax,7 + b"\x0f\xa2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + self._zero_ecx(), + b"\xB8\x07\x00\x00\x00" # mov eax,7 + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # Get the extended CPU flags + extended_flags = { + #'fsgsbase' : is_bit_set(ebx, 0), + #'IA32_TSC_ADJUST' : is_bit_set(ebx, 1), + 'sgx' : is_bit_set(ebx, 2), + 'bmi1' : is_bit_set(ebx, 3), + 'hle' : is_bit_set(ebx, 4), + 'avx2' : is_bit_set(ebx, 5), + #'reserved' : is_bit_set(ebx, 6), + 'smep' : is_bit_set(ebx, 7), + 'bmi2' : is_bit_set(ebx, 8), + 'erms' : is_bit_set(ebx, 9), + 'invpcid' : is_bit_set(ebx, 10), + 'rtm' : is_bit_set(ebx, 11), + 'pqm' : is_bit_set(ebx, 12), + #'FPU CS and FPU DS deprecated' : is_bit_set(ebx, 13), + 'mpx' : is_bit_set(ebx, 14), + 'pqe' : is_bit_set(ebx, 15), + 'avx512f' : is_bit_set(ebx, 16), + 'avx512dq' : is_bit_set(ebx, 17), + 'rdseed' : is_bit_set(ebx, 18), + 'adx' : is_bit_set(ebx, 19), + 'smap' : is_bit_set(ebx, 20), + 'avx512ifma' : is_bit_set(ebx, 21), + 'pcommit' : is_bit_set(ebx, 22), + 'clflushopt' : is_bit_set(ebx, 23), + 'clwb' : is_bit_set(ebx, 24), + 'intel_pt' : is_bit_set(ebx, 25), + 'avx512pf' : is_bit_set(ebx, 26), + 'avx512er' : is_bit_set(ebx, 27), + 'avx512cd' : is_bit_set(ebx, 28), + 'sha' : is_bit_set(ebx, 29), + 'avx512bw' : is_bit_set(ebx, 30), + 'avx512vl' : is_bit_set(ebx, 31), + + 'prefetchwt1' : is_bit_set(ecx, 0), + 'avx512vbmi' : is_bit_set(ecx, 1), + 'umip' : is_bit_set(ecx, 2), + 'pku' : is_bit_set(ecx, 3), + 'ospke' : is_bit_set(ecx, 4), + #'reserved' : is_bit_set(ecx, 5), + 'avx512vbmi2' : is_bit_set(ecx, 6), + #'reserved' : is_bit_set(ecx, 7), + 'gfni' : is_bit_set(ecx, 8), + 'vaes' : is_bit_set(ecx, 9), + 'vpclmulqdq' : 
is_bit_set(ecx, 10), + 'avx512vnni' : is_bit_set(ecx, 11), + 'avx512bitalg' : is_bit_set(ecx, 12), + #'reserved' : is_bit_set(ecx, 13), + 'avx512vpopcntdq' : is_bit_set(ecx, 14), + #'reserved' : is_bit_set(ecx, 15), + #'reserved' : is_bit_set(ecx, 16), + #'mpx0' : is_bit_set(ecx, 17), + #'mpx1' : is_bit_set(ecx, 18), + #'mpx2' : is_bit_set(ecx, 19), + #'mpx3' : is_bit_set(ecx, 20), + #'mpx4' : is_bit_set(ecx, 21), + 'rdpid' : is_bit_set(ecx, 22), + #'reserved' : is_bit_set(ecx, 23), + #'reserved' : is_bit_set(ecx, 24), + #'reserved' : is_bit_set(ecx, 25), + #'reserved' : is_bit_set(ecx, 26), + #'reserved' : is_bit_set(ecx, 27), + #'reserved' : is_bit_set(ecx, 28), + #'reserved' : is_bit_set(ecx, 29), + 'sgx_lc' : is_bit_set(ecx, 30), + #'reserved' : is_bit_set(ecx, 31) + } + + # Get a list of only the flags that are true + extended_flags = [k for k, v in extended_flags.items() if v] + flags += extended_flags + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D80000001h:_Extended_Processor_Info_and_Feature_Bits + if max_extension_support >= 0x80000001: + # EBX + ebx = self._run_asm( + b"\xB8\x01\x00\x00\x80" # mov ax,0x80000001 + b"\x0f\xa2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + b"\xB8\x01\x00\x00\x80" # mov ax,0x80000001 + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # Get the extended CPU flags + extended_flags = { + 'fpu' : is_bit_set(ebx, 0), + 'vme' : is_bit_set(ebx, 1), + 'de' : is_bit_set(ebx, 2), + 'pse' : is_bit_set(ebx, 3), + 'tsc' : is_bit_set(ebx, 4), + 'msr' : is_bit_set(ebx, 5), + 'pae' : is_bit_set(ebx, 6), + 'mce' : is_bit_set(ebx, 7), + 'cx8' : is_bit_set(ebx, 8), + 'apic' : is_bit_set(ebx, 9), + #'reserved' : is_bit_set(ebx, 10), + 'syscall' : is_bit_set(ebx, 11), + 'mtrr' : is_bit_set(ebx, 12), + 'pge' : is_bit_set(ebx, 13), + 'mca' : is_bit_set(ebx, 14), + 'cmov' : is_bit_set(ebx, 15), + 'pat' : is_bit_set(ebx, 16), + 'pse36' : is_bit_set(ebx, 17), + #'reserved' : is_bit_set(ebx, 18), + 'mp' : is_bit_set(ebx, 19), + 'nx' : is_bit_set(ebx, 20), + #'reserved' : is_bit_set(ebx, 21), + 'mmxext' : is_bit_set(ebx, 22), + 'mmx' : is_bit_set(ebx, 23), + 'fxsr' : is_bit_set(ebx, 24), + 'fxsr_opt' : is_bit_set(ebx, 25), + 'pdpe1gp' : is_bit_set(ebx, 26), + 'rdtscp' : is_bit_set(ebx, 27), + #'reserved' : is_bit_set(ebx, 28), + 'lm' : is_bit_set(ebx, 29), + '3dnowext' : is_bit_set(ebx, 30), + '3dnow' : is_bit_set(ebx, 31), + + 'lahf_lm' : is_bit_set(ecx, 0), + 'cmp_legacy' : is_bit_set(ecx, 1), + 'svm' : is_bit_set(ecx, 2), + 'extapic' : is_bit_set(ecx, 3), + 'cr8_legacy' : is_bit_set(ecx, 4), + 'abm' : is_bit_set(ecx, 5), + 'sse4a' : is_bit_set(ecx, 6), + 'misalignsse' : is_bit_set(ecx, 7), + '3dnowprefetch' : is_bit_set(ecx, 8), + 'osvw' : is_bit_set(ecx, 9), + 'ibs' : is_bit_set(ecx, 10), + 'xop' : is_bit_set(ecx, 11), + 'skinit' : is_bit_set(ecx, 12), + 'wdt' : is_bit_set(ecx, 13), + #'reserved' : is_bit_set(ecx, 14), + 'lwp' : is_bit_set(ecx, 15), + 'fma4' : is_bit_set(ecx, 16), + 'tce' : is_bit_set(ecx, 17), + #'reserved' : is_bit_set(ecx, 18), + 'nodeid_msr' : is_bit_set(ecx, 19), + #'reserved' : is_bit_set(ecx, 20), + 'tbm' : is_bit_set(ecx, 21), + 'topoext' : is_bit_set(ecx, 22), + 'perfctr_core' : is_bit_set(ecx, 23), + 'perfctr_nb' : is_bit_set(ecx, 24), + #'reserved' : is_bit_set(ecx, 25), + 'dbx' : is_bit_set(ecx, 26), + 'perftsc' : is_bit_set(ecx, 27), + 'pci_l2i' : is_bit_set(ecx, 28), + #'reserved' : is_bit_set(ecx, 29), + #'reserved' : is_bit_set(ecx, 30), + #'reserved' : is_bit_set(ecx, 31) + } 
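+                               # --- Editorial note: the following comments are not part of the upstream
+                               # py-cpuinfo module; they only summarize what the surrounding code does. ---
+                               # Each CPUID query above returns one raw 32-bit register value, and
+                               # is_bit_set() simply masks a single bit of it, for example:
+                               #     is_bit_set(0b0100, 2)  ->  True
+                               #     is_bit_set(0b0100, 0)  ->  False
+                               # The dictionaries map documented CPUID bit positions to flag names, and the
+                               # comprehension that follows keeps only the names whose bit is actually set.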
+ + # Get a list of only the flags that are true + extended_flags = [k for k, v in extended_flags.items() if v] + flags += extended_flags + + flags.sort() + return flags + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D80000002h.2C80000003h.2C80000004h:_Processor_Brand_String + def get_processor_brand(self, max_extension_support): + processor_brand = "" + + # Processor brand string + if max_extension_support >= 0x80000004: + instructions = [ + b"\xB8\x02\x00\x00\x80", # mov ax,0x80000002 + b"\xB8\x03\x00\x00\x80", # mov ax,0x80000003 + b"\xB8\x04\x00\x00\x80" # mov ax,0x80000004 + ] + for instruction in instructions: + # EAX + eax = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xC0" # mov ax,ax + b"\xC3" # ret + ) + + # EBX + ebx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xD8" # mov ax,bx + b"\xC3" # ret + ) + + # ECX + ecx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + # EDX + edx = self._run_asm( + instruction, # mov ax,0x8000000? + b"\x0f\xa2" # cpuid + b"\x89\xD0" # mov ax,dx + b"\xC3" # ret + ) + + # Combine each of the 4 bytes in each register into the string + for reg in [eax, ebx, ecx, edx]: + for n in [0, 8, 16, 24]: + processor_brand += chr((reg >> n) & 0xFF) + + # Strip off any trailing NULL terminators and white space + processor_brand = processor_brand.strip("\0").strip() + + return processor_brand + + # http://en.wikipedia.org/wiki/CPUID#EAX.3D80000006h:_Extended_L2_Cache_Features + def get_cache(self, max_extension_support): + cache_info = {} + + # Just return if the cache feature is not supported + if max_extension_support < 0x80000006: + return cache_info + + # ECX + ecx = self._run_asm( + b"\xB8\x06\x00\x00\x80" # mov ax,0x80000006 + b"\x0f\xa2" # cpuid + b"\x89\xC8" # mov ax,cx + b"\xC3" # ret + ) + + cache_info = { + 'size_kb' : ecx & 0xFF, + 'line_size_b' : (ecx >> 12) & 0xF, + 'associativity' : (ecx >> 16) & 0xFFFF + } + + return cache_info + + def get_ticks(self): + retval = None + + if DataSource.bits == '32bit': + # Works on x86_32 + restype = None + argtypes = (ctypes.POINTER(ctypes.c_uint), ctypes.POINTER(ctypes.c_uint)) + get_ticks_x86_32, address = self._asm_func(restype, argtypes, + [ + b"\x55", # push bp + b"\x89\xE5", # mov bp,sp + b"\x31\xC0", # xor ax,ax + b"\x0F\xA2", # cpuid + b"\x0F\x31", # rdtsc + b"\x8B\x5D\x08", # mov bx,[di+0x8] + b"\x8B\x4D\x0C", # mov cx,[di+0xc] + b"\x89\x13", # mov [bp+di],dx + b"\x89\x01", # mov [bx+di],ax + b"\x5D", # pop bp + b"\xC3" # ret + ] + ) + + high = ctypes.c_uint32(0) + low = ctypes.c_uint32(0) + + get_ticks_x86_32(ctypes.byref(high), ctypes.byref(low)) + retval = ((high.value << 32) & 0xFFFFFFFF00000000) | low.value + elif DataSource.bits == '64bit': + # Works on x86_64 + restype = ctypes.c_uint64 + argtypes = () + get_ticks_x86_64, address = self._asm_func(restype, argtypes, + [ + b"\x48", # dec ax + b"\x31\xC0", # xor ax,ax + b"\x0F\xA2", # cpuid + b"\x0F\x31", # rdtsc + b"\x48", # dec ax + b"\xC1\xE2\x20", # shl dx,byte 0x20 + b"\x48", # dec ax + b"\x09\xD0", # or ax,dx + b"\xC3", # ret + ] + ) + retval = get_ticks_x86_64() + + return retval + + def get_raw_hz(self): + start = self.get_ticks() + + time.sleep(1) + + end = self.get_ticks() + + ticks = (end - start) + + return ticks + +def _actual_get_cpu_info_from_cpuid(queue): + ''' + Warning! This function has the potential to crash the Python runtime. + Do not call it directly. 
Use the _get_cpu_info_from_cpuid function instead. + It will safely call this function in another process. + ''' + + # Pipe all output to nothing + sys.stdout = open(os.devnull, 'w') + sys.stderr = open(os.devnull, 'w') + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + # Return none if this is not an X86 CPU + if not arch in ['X86_32', 'X86_64']: + queue.put(obj_to_b64({})) + return + + # Return none if SE Linux is in enforcing mode + cpuid = CPUID() + if cpuid.is_selinux_enforcing: + queue.put(obj_to_b64({})) + return + + # Get the cpu info from the CPUID register + max_extension_support = cpuid.get_max_extension_support() + cache_info = cpuid.get_cache(max_extension_support) + info = cpuid.get_info() + + processor_brand = cpuid.get_processor_brand(max_extension_support) + + # Get the Hz and scale + hz_actual = cpuid.get_raw_hz() + hz_actual = to_hz_string(hz_actual) + + # Get the Hz and scale + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + info = { + 'vendor_id' : cpuid.get_vendor_id(), + 'hardware' : '', + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 0), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 0), + + 'l2_cache_size' : to_friendly_bytes(cache_info['size_kb']), + 'l2_cache_line_size' : cache_info['line_size_b'], + 'l2_cache_associativity' : hex(cache_info['associativity']), + + 'stepping' : info['stepping'], + 'model' : info['model'], + 'family' : info['family'], + 'processor_type' : info['processor_type'], + 'extended_model' : info['extended_model'], + 'extended_family' : info['extended_family'], + 'flags' : cpuid.get_flags(max_extension_support) + } + + info = {k: v for k, v in info.items() if v} + queue.put(obj_to_b64(info)) + +def _get_cpu_info_from_cpuid(): + ''' + Returns the CPU info gathered by querying the X86 cpuid register in a new process. + Returns {} on non X86 cpus. + Returns {} if SELinux is in enforcing mode. + ''' + from multiprocessing import Process, Queue + + # Return {} if can't cpuid + if not DataSource.can_cpuid: + return {} + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + # Return {} if this is not an X86 CPU + if not arch in ['X86_32', 'X86_64']: + return {} + + try: + # Start running the function in a subprocess + queue = Queue() + p = Process(target=_actual_get_cpu_info_from_cpuid, args=(queue,)) + p.start() + + # Wait for the process to end, while it is still alive + while p.is_alive(): + p.join(0) + + # Return {} if it failed + if p.exitcode != 0: + return {} + + # Return the result, only if there is something to read + if not queue.empty(): + output = queue.get() + return b64_to_obj(output) + except: + pass + + # Return {} if everything failed + return {} + +def _get_cpu_info_from_proc_cpuinfo(): + ''' + Returns the CPU info gathered from /proc/cpuinfo. + Returns {} if /proc/cpuinfo is not found. 
+ ''' + try: + # Just return {} if there is no cpuinfo + if not DataSource.has_proc_cpuinfo(): + return {} + + returncode, output = DataSource.cat_proc_cpuinfo() + if returncode != 0: + return {} + + # Various fields + vendor_id = _get_field(False, output, None, '', 'vendor_id', 'vendor id', 'vendor') + processor_brand = _get_field(True, output, None, None, 'model name','cpu', 'processor') + cache_size = _get_field(False, output, None, '', 'cache size') + stepping = _get_field(False, output, int, 0, 'stepping') + model = _get_field(False, output, int, 0, 'model') + family = _get_field(False, output, int, 0, 'cpu family') + hardware = _get_field(False, output, None, '', 'Hardware') + # Flags + flags = _get_field(False, output, None, None, 'flags', 'Features') + if flags: + flags = flags.split() + flags.sort() + + # Convert from MHz string to Hz + hz_actual = _get_field(False, output, None, '', 'cpu MHz', 'cpu speed', 'clock') + hz_actual = hz_actual.lower().rstrip('mhz').strip() + hz_actual = to_hz_string(hz_actual) + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = (0, None) + try: + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + except Exception: + pass + + info = { + 'hardware' : hardware, + 'brand' : processor_brand, + + 'l3_cache_size' : to_friendly_bytes(cache_size), + 'flags' : flags, + 'vendor_id' : vendor_id, + 'stepping' : stepping, + 'model' : model, + 'family' : family, + } + + # Make the Hz the same for actual and advertised if missing any + if not hz_advertised or hz_advertised == '0.0': + hz_advertised = hz_actual + scale = 6 + elif not hz_actual or hz_actual == '0.0': + hz_actual = hz_advertised + + # Add the Hz if there is one + if to_raw_hz(hz_advertised, scale) > (0, 0): + info['hz_advertised'] = to_friendly_hz(hz_advertised, scale) + info['hz_advertised_raw'] = to_raw_hz(hz_advertised, scale) + if to_raw_hz(hz_actual, scale) > (0, 0): + info['hz_actual'] = to_friendly_hz(hz_actual, 6) + info['hz_actual_raw'] = to_raw_hz(hz_actual, 6) + + info = {k: v for k, v in info.items() if v} + return info + except: + #raise # NOTE: To have this throw on error, uncomment this line + return {} + +def _get_cpu_info_from_cpufreq_info(): + ''' + Returns the CPU info gathered from cpufreq-info. + Returns {} if cpufreq-info is not found. + ''' + try: + scale, hz_brand = 1, '0.0' + + if not DataSource.has_cpufreq_info(): + return {} + + returncode, output = DataSource.cpufreq_info() + if returncode != 0: + return {} + + hz_brand = output.split('current CPU frequency is')[1].split('\n')[0] + i = hz_brand.find('Hz') + assert(i != -1) + hz_brand = hz_brand[0 : i+2].strip().lower() + + if hz_brand.endswith('mhz'): + scale = 6 + elif hz_brand.endswith('ghz'): + scale = 9 + hz_brand = hz_brand.rstrip('mhz').rstrip('ghz').strip() + hz_brand = to_hz_string(hz_brand) + + info = { + 'hz_advertised' : to_friendly_hz(hz_brand, scale), + 'hz_actual' : to_friendly_hz(hz_brand, scale), + 'hz_advertised_raw' : to_raw_hz(hz_brand, scale), + 'hz_actual_raw' : to_raw_hz(hz_brand, scale), + } + + info = {k: v for k, v in info.items() if v} + return info + except: + #raise # NOTE: To have this throw on error, uncomment this line + return {} + +def _get_cpu_info_from_lscpu(): + ''' + Returns the CPU info gathered from lscpu. + Returns {} if lscpu is not found. 
+ ''' + try: + if not DataSource.has_lscpu(): + return {} + + returncode, output = DataSource.lscpu() + if returncode != 0: + return {} + + info = {} + + new_hz = _get_field(False, output, None, None, 'CPU max MHz', 'CPU MHz') + if new_hz: + new_hz = to_hz_string(new_hz) + scale = 6 + info['hz_advertised'] = to_friendly_hz(new_hz, scale) + info['hz_actual'] = to_friendly_hz(new_hz, scale) + info['hz_advertised_raw'] = to_raw_hz(new_hz, scale) + info['hz_actual_raw'] = to_raw_hz(new_hz, scale) + + vendor_id = _get_field(False, output, None, None, 'Vendor ID') + if vendor_id: + info['vendor_id'] = vendor_id + + brand = _get_field(False, output, None, None, 'Model name') + if brand: + info['brand'] = brand + + family = _get_field(False, output, None, None, 'CPU family') + if family and family.isdigit(): + info['family'] = int(family) + + stepping = _get_field(False, output, None, None, 'Stepping') + if stepping and stepping.isdigit(): + info['stepping'] = int(stepping) + + model = _get_field(False, output, None, None, 'Model') + if model and model.isdigit(): + info['model'] = int(model) + + l1_data_cache_size = _get_field(False, output, None, None, 'L1d cache') + if l1_data_cache_size: + info['l1_data_cache_size'] = to_friendly_bytes(l1_data_cache_size) + + l1_instruction_cache_size = _get_field(False, output, None, None, 'L1i cache') + if l1_instruction_cache_size: + info['l1_instruction_cache_size'] = to_friendly_bytes(l1_instruction_cache_size) + + l2_cache_size = _get_field(False, output, None, None, 'L2 cache') + if l2_cache_size: + info['l2_cache_size'] = to_friendly_bytes(l2_cache_size) + + l3_cache_size = _get_field(False, output, None, None, 'L3 cache') + if l3_cache_size: + info['l3_cache_size'] = to_friendly_bytes(l3_cache_size) + + # Flags + flags = _get_field(False, output, None, None, 'flags', 'Features') + if flags: + flags = flags.split() + flags.sort() + info['flags'] = flags + + info = {k: v for k, v in info.items() if v} + return info + except: + #raise # NOTE: To have this throw on error, uncomment this line + return {} + +def _get_cpu_info_from_dmesg(): + ''' + Returns the CPU info gathered from dmesg. + Returns {} if dmesg is not found or does not have the desired info. + ''' + # Just return {} if there is no dmesg + if not DataSource.has_dmesg(): + return {} + + # If dmesg fails return {} + returncode, output = DataSource.dmesg_a() + if output == None or returncode != 0: + return {} + + return _parse_dmesg_output(output) + + +# https://openpowerfoundation.org/wp-content/uploads/2016/05/LoPAPR_DRAFT_v11_24March2016_cmt1.pdf +# page 767 +def _get_cpu_info_from_ibm_pa_features(): + ''' + Returns the CPU info gathered from lsprop /proc/device-tree/cpus/*/ibm,pa-features + Returns {} if lsprop is not found or ibm,pa-features does not have the desired info. 
+ ''' + try: + # Just return {} if there is no lsprop + if not DataSource.has_ibm_pa_features(): + return {} + + # If ibm,pa-features fails return {} + returncode, output = DataSource.ibm_pa_features() + if output == None or returncode != 0: + return {} + + # Filter out invalid characters from output + value = output.split("ibm,pa-features")[1].lower() + value = [s for s in value if s in list('0123456789abcfed')] + value = ''.join(value) + + # Get data converted to Uint32 chunks + left = int(value[0 : 8], 16) + right = int(value[8 : 16], 16) + + # Get the CPU flags + flags = { + # Byte 0 + 'mmu' : is_bit_set(left, 0), + 'fpu' : is_bit_set(left, 1), + 'slb' : is_bit_set(left, 2), + 'run' : is_bit_set(left, 3), + #'reserved' : is_bit_set(left, 4), + 'dabr' : is_bit_set(left, 5), + 'ne' : is_bit_set(left, 6), + 'wtr' : is_bit_set(left, 7), + + # Byte 1 + 'mcr' : is_bit_set(left, 8), + 'dsisr' : is_bit_set(left, 9), + 'lp' : is_bit_set(left, 10), + 'ri' : is_bit_set(left, 11), + 'dabrx' : is_bit_set(left, 12), + 'sprg3' : is_bit_set(left, 13), + 'rislb' : is_bit_set(left, 14), + 'pp' : is_bit_set(left, 15), + + # Byte 2 + 'vpm' : is_bit_set(left, 16), + 'dss_2.05' : is_bit_set(left, 17), + #'reserved' : is_bit_set(left, 18), + 'dar' : is_bit_set(left, 19), + #'reserved' : is_bit_set(left, 20), + 'ppr' : is_bit_set(left, 21), + 'dss_2.02' : is_bit_set(left, 22), + 'dss_2.06' : is_bit_set(left, 23), + + # Byte 3 + 'lsd_in_dscr' : is_bit_set(left, 24), + 'ugr_in_dscr' : is_bit_set(left, 25), + #'reserved' : is_bit_set(left, 26), + #'reserved' : is_bit_set(left, 27), + #'reserved' : is_bit_set(left, 28), + #'reserved' : is_bit_set(left, 29), + #'reserved' : is_bit_set(left, 30), + #'reserved' : is_bit_set(left, 31), + + # Byte 4 + 'sso_2.06' : is_bit_set(right, 0), + #'reserved' : is_bit_set(right, 1), + #'reserved' : is_bit_set(right, 2), + #'reserved' : is_bit_set(right, 3), + #'reserved' : is_bit_set(right, 4), + #'reserved' : is_bit_set(right, 5), + #'reserved' : is_bit_set(right, 6), + #'reserved' : is_bit_set(right, 7), + + # Byte 5 + 'le' : is_bit_set(right, 8), + 'cfar' : is_bit_set(right, 9), + 'eb' : is_bit_set(right, 10), + 'lsq_2.07' : is_bit_set(right, 11), + #'reserved' : is_bit_set(right, 12), + #'reserved' : is_bit_set(right, 13), + #'reserved' : is_bit_set(right, 14), + #'reserved' : is_bit_set(right, 15), + + # Byte 6 + 'dss_2.07' : is_bit_set(right, 16), + #'reserved' : is_bit_set(right, 17), + #'reserved' : is_bit_set(right, 18), + #'reserved' : is_bit_set(right, 19), + #'reserved' : is_bit_set(right, 20), + #'reserved' : is_bit_set(right, 21), + #'reserved' : is_bit_set(right, 22), + #'reserved' : is_bit_set(right, 23), + + # Byte 7 + #'reserved' : is_bit_set(right, 24), + #'reserved' : is_bit_set(right, 25), + #'reserved' : is_bit_set(right, 26), + #'reserved' : is_bit_set(right, 27), + #'reserved' : is_bit_set(right, 28), + #'reserved' : is_bit_set(right, 29), + #'reserved' : is_bit_set(right, 30), + #'reserved' : is_bit_set(right, 31), + } + + # Get a list of only the flags that are true + flags = [k for k, v in flags.items() if v] + flags.sort() + + info = { + 'flags' : flags + } + info = {k: v for k, v in info.items() if v} + + return info + except: + return {} + + +def _get_cpu_info_from_cat_var_run_dmesg_boot(): + ''' + Returns the CPU info gathered from /var/run/dmesg.boot. + Returns {} if dmesg is not found or does not have the desired info. 
+ ''' + # Just return {} if there is no /var/run/dmesg.boot + if not DataSource.has_var_run_dmesg_boot(): + return {} + + # If dmesg.boot fails return {} + returncode, output = DataSource.cat_var_run_dmesg_boot() + if output == None or returncode != 0: + return {} + + return _parse_dmesg_output(output) + + +def _get_cpu_info_from_sysctl(): + ''' + Returns the CPU info gathered from sysctl. + Returns {} if sysctl is not found. + ''' + try: + # Just return {} if there is no sysctl + if not DataSource.has_sysctl(): + return {} + + # If sysctl fails return {} + returncode, output = DataSource.sysctl_machdep_cpu_hw_cpufrequency() + if output == None or returncode != 0: + return {} + + # Various fields + vendor_id = _get_field(False, output, None, None, 'machdep.cpu.vendor') + processor_brand = _get_field(True, output, None, None, 'machdep.cpu.brand_string') + cache_size = _get_field(False, output, None, None, 'machdep.cpu.cache.size') + stepping = _get_field(False, output, int, 0, 'machdep.cpu.stepping') + model = _get_field(False, output, int, 0, 'machdep.cpu.model') + family = _get_field(False, output, int, 0, 'machdep.cpu.family') + + # Flags + flags = _get_field(False, output, None, '', 'machdep.cpu.features').lower().split() + flags.extend(_get_field(False, output, None, '', 'machdep.cpu.leaf7_features').lower().split()) + flags.extend(_get_field(False, output, None, '', 'machdep.cpu.extfeatures').lower().split()) + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + hz_actual = _get_field(False, output, None, None, 'hw.cpufrequency') + hz_actual = to_hz_string(hz_actual) + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 0), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 0), + + 'l2_cache_size' : to_friendly_bytes(cache_size), + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'flags' : flags + } + + info = {k: v for k, v in info.items() if v} + return info + except: + return {} + + +def _get_cpu_info_from_sysinfo(): + ''' + Returns the CPU info gathered from sysinfo. + Returns {} if sysinfo is not found. + ''' + info = _get_cpu_info_from_sysinfo_v1() + info.update(_get_cpu_info_from_sysinfo_v2()) + return info + +def _get_cpu_info_from_sysinfo_v1(): + ''' + Returns the CPU info gathered from sysinfo. + Returns {} if sysinfo is not found. 
+ ''' + try: + # Just return {} if there is no sysinfo + if not DataSource.has_sysinfo(): + return {} + + # If sysinfo fails return {} + returncode, output = DataSource.sysinfo_cpu() + if output == None or returncode != 0: + return {} + + # Various fields + vendor_id = '' #_get_field(False, output, None, None, 'CPU #0: ') + processor_brand = output.split('CPU #0: "')[1].split('"\n')[0] + cache_size = '' #_get_field(False, output, None, None, 'machdep.cpu.cache.size') + stepping = int(output.split(', stepping ')[1].split(',')[0].strip()) + model = int(output.split(', model ')[1].split(',')[0].strip()) + family = int(output.split(', family ')[1].split(',')[0].strip()) + + # Flags + flags = [] + for line in output.split('\n'): + if line.startswith('\t\t'): + for flag in line.strip().lower().split(): + flags.append(flag) + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + hz_actual = hz_advertised + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, scale), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, scale), + + 'l2_cache_size' : to_friendly_bytes(cache_size), + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'flags' : flags + } + + info = {k: v for k, v in info.items() if v} + return info + except: + return {} + +def _get_cpu_info_from_sysinfo_v2(): + ''' + Returns the CPU info gathered from sysinfo. + Returns {} if sysinfo is not found. + ''' + try: + # Just return {} if there is no sysinfo + if not DataSource.has_sysinfo(): + return {} + + # If sysinfo fails return {} + returncode, output = DataSource.sysinfo_cpu() + if output == None or returncode != 0: + return {} + + # Various fields + vendor_id = '' #_get_field(False, output, None, None, 'CPU #0: ') + processor_brand = output.split('CPU #0: "')[1].split('"\n')[0] + cache_size = '' #_get_field(False, output, None, None, 'machdep.cpu.cache.size') + signature = output.split('Signature:')[1].split('\n')[0].strip() + # + stepping = int(signature.split('stepping ')[1].split(',')[0].strip()) + model = int(signature.split('model ')[1].split(',')[0].strip()) + family = int(signature.split('family ')[1].split(',')[0].strip()) + + # Flags + def get_subsection_flags(output): + retval = [] + for line in output.split('\n')[1:]: + if not line.startswith(' '): break + for entry in line.strip().lower().split(' '): + retval.append(entry) + return retval + + flags = get_subsection_flags(output.split('Features: ')[1]) + \ + get_subsection_flags(output.split('Extended Features (0x00000001): ')[1]) + \ + get_subsection_flags(output.split('Extended Features (0x80000001): ')[1]) + flags.sort() + + # Convert from GHz/MHz string to Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + hz_actual = hz_advertised + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, scale), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, scale), + + 'l2_cache_size' : to_friendly_bytes(cache_size), + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'flags' : flags + } + + info = {k: v for k, v in info.items() if v} + return info + except: + return {} + +def _get_cpu_info_from_wmic(): + ''' + Returns the CPU info gathered 
from WMI. + Returns {} if not on Windows, or wmic is not installed. + ''' + + try: + # Just return {} if not Windows or there is no wmic + if not DataSource.is_windows or not DataSource.has_wmic(): + return {} + + returncode, output = DataSource.wmic_cpu() + if output == None or returncode != 0: + return {} + + # Break the list into key values pairs + value = output.split("\n") + value = [s.rstrip().split('=') for s in value if '=' in s] + value = {k: v for k, v in value if v} + + # Get the advertised MHz + processor_brand = value.get('Name') + scale_advertised, hz_advertised = _get_hz_string_from_brand(processor_brand) + + # Get the actual MHz + hz_actual = value.get('CurrentClockSpeed') + scale_actual = 6 + if hz_actual: + hz_actual = to_hz_string(hz_actual) + + # Get cache sizes + l2_cache_size = value.get('L2CacheSize') + if l2_cache_size: + l2_cache_size = l2_cache_size + ' KB' + + l3_cache_size = value.get('L3CacheSize') + if l3_cache_size: + l3_cache_size = l3_cache_size + ' KB' + + # Get family, model, and stepping + family, model, stepping = '', '', '' + description = value.get('Description') or value.get('Caption') + entries = description.split(' ') + + if 'Family' in entries and entries.index('Family') < len(entries)-1: + i = entries.index('Family') + family = int(entries[i + 1]) + + if 'Model' in entries and entries.index('Model') < len(entries)-1: + i = entries.index('Model') + model = int(entries[i + 1]) + + if 'Stepping' in entries and entries.index('Stepping') < len(entries)-1: + i = entries.index('Stepping') + stepping = int(entries[i + 1]) + + info = { + 'vendor_id' : value.get('Manufacturer'), + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale_advertised), + 'hz_actual' : to_friendly_hz(hz_actual, scale_actual), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale_advertised), + 'hz_actual_raw' : to_raw_hz(hz_actual, scale_actual), + + 'l2_cache_size' : l2_cache_size, + 'l3_cache_size' : l3_cache_size, + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + } + + info = {k: v for k, v in info.items() if v} + return info + except: + #raise # NOTE: To have this throw on error, uncomment this line + return {} + +def _get_cpu_info_from_registry(): + ''' + FIXME: Is missing many of the newer CPU flags like sse3 + Returns the CPU info gathered from the Windows Registry. + Returns {} if not on Windows. 
+ ''' + try: + # Just return {} if not on Windows + if not DataSource.is_windows: + return {} + + # Get the CPU name + processor_brand = DataSource.winreg_processor_brand() + + # Get the CPU vendor id + vendor_id = DataSource.winreg_vendor_id() + + # Get the CPU arch and bits + raw_arch_string = DataSource.winreg_raw_arch_string() + arch, bits = parse_arch(raw_arch_string) + + # Get the actual CPU Hz + hz_actual = DataSource.winreg_hz_actual() + hz_actual = to_hz_string(hz_actual) + + # Get the advertised CPU Hz + scale, hz_advertised = _get_hz_string_from_brand(processor_brand) + + # Get the CPU features + feature_bits = DataSource.winreg_feature_bits() + + def is_set(bit): + mask = 0x80000000 >> bit + retval = mask & feature_bits > 0 + return retval + + # http://en.wikipedia.org/wiki/CPUID + # http://unix.stackexchange.com/questions/43539/what-do-the-flags-in-proc-cpuinfo-mean + # http://www.lohninger.com/helpcsuite/public_constants_cpuid.htm + flags = { + 'fpu' : is_set(0), # Floating Point Unit + 'vme' : is_set(1), # V86 Mode Extensions + 'de' : is_set(2), # Debug Extensions - I/O breakpoints supported + 'pse' : is_set(3), # Page Size Extensions (4 MB pages supported) + 'tsc' : is_set(4), # Time Stamp Counter and RDTSC instruction are available + 'msr' : is_set(5), # Model Specific Registers + 'pae' : is_set(6), # Physical Address Extensions (36 bit address, 2MB pages) + 'mce' : is_set(7), # Machine Check Exception supported + 'cx8' : is_set(8), # Compare Exchange Eight Byte instruction available + 'apic' : is_set(9), # Local APIC present (multiprocessor operation support) + 'sepamd' : is_set(10), # Fast system calls (AMD only) + 'sep' : is_set(11), # Fast system calls + 'mtrr' : is_set(12), # Memory Type Range Registers + 'pge' : is_set(13), # Page Global Enable + 'mca' : is_set(14), # Machine Check Architecture + 'cmov' : is_set(15), # Conditional MOVe instructions + 'pat' : is_set(16), # Page Attribute Table + 'pse36' : is_set(17), # 36 bit Page Size Extensions + 'serial' : is_set(18), # Processor Serial Number + 'clflush' : is_set(19), # Cache Flush + #'reserved1' : is_set(20), # reserved + 'dts' : is_set(21), # Debug Trace Store + 'acpi' : is_set(22), # ACPI support + 'mmx' : is_set(23), # MultiMedia Extensions + 'fxsr' : is_set(24), # FXSAVE and FXRSTOR instructions + 'sse' : is_set(25), # SSE instructions + 'sse2' : is_set(26), # SSE2 (WNI) instructions + 'ss' : is_set(27), # self snoop + #'reserved2' : is_set(28), # reserved + 'tm' : is_set(29), # Automatic clock control + 'ia64' : is_set(30), # IA64 instructions + '3dnow' : is_set(31) # 3DNow! instructions available + } + + # Get a list of only the flags that are true + flags = [k for k, v in flags.items() if v] + flags.sort() + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 6), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 6), + + 'flags' : flags + } + + info = {k: v for k, v in info.items() if v} + return info + except: + return {} + +def _get_cpu_info_from_kstat(): + ''' + Returns the CPU info gathered from isainfo and kstat. + Returns {} if isainfo or kstat are not found. 
+ ''' + try: + # Just return {} if there is no isainfo or kstat + if not DataSource.has_isainfo() or not DataSource.has_kstat(): + return {} + + # If isainfo fails return {} + returncode, flag_output = DataSource.isainfo_vb() + if flag_output == None or returncode != 0: + return {} + + # If kstat fails return {} + returncode, kstat = DataSource.kstat_m_cpu_info() + if kstat == None or returncode != 0: + return {} + + # Various fields + vendor_id = kstat.split('\tvendor_id ')[1].split('\n')[0].strip() + processor_brand = kstat.split('\tbrand ')[1].split('\n')[0].strip() + stepping = int(kstat.split('\tstepping ')[1].split('\n')[0].strip()) + model = int(kstat.split('\tmodel ')[1].split('\n')[0].strip()) + family = int(kstat.split('\tfamily ')[1].split('\n')[0].strip()) + + # Flags + flags = flag_output.strip().split('\n')[-1].strip().lower().split() + flags.sort() + + # Convert from GHz/MHz string to Hz + scale = 6 + hz_advertised = kstat.split('\tclock_MHz ')[1].split('\n')[0].strip() + hz_advertised = to_hz_string(hz_advertised) + + # Convert from GHz/MHz string to Hz + hz_actual = kstat.split('\tcurrent_clock_Hz ')[1].split('\n')[0].strip() + hz_actual = to_hz_string(hz_actual) + + info = { + 'vendor_id' : vendor_id, + 'brand' : processor_brand, + + 'hz_advertised' : to_friendly_hz(hz_advertised, scale), + 'hz_actual' : to_friendly_hz(hz_actual, 0), + 'hz_advertised_raw' : to_raw_hz(hz_advertised, scale), + 'hz_actual_raw' : to_raw_hz(hz_actual, 0), + + 'stepping' : stepping, + 'model' : model, + 'family' : family, + 'flags' : flags + } + + info = {k: v for k, v in info.items() if v} + return info + except: + return {} + +def CopyNewFields(info, new_info): + keys = [ + 'vendor_id', 'hardware', 'brand', 'hz_advertised', 'hz_actual', + 'hz_advertised_raw', 'hz_actual_raw', 'arch', 'bits', 'count', + 'raw_arch_string', 'l2_cache_size', 'l2_cache_line_size', + 'l2_cache_associativity', 'stepping', 'model', 'family', + 'processor_type', 'extended_model', 'extended_family', 'flags', + 'l3_cache_size', 'l1_data_cache_size', 'l1_instruction_cache_size' + ] + + for key in keys: + if new_info.get(key, None) and not info.get(key, None): + info[key] = new_info[key] + elif key == 'flags' and new_info.get('flags'): + for f in new_info['flags']: + if f not in info['flags']: info['flags'].append(f) + info['flags'].sort() + +def get_cpu_info(): + ''' + Returns the CPU info by using the best sources of information for your OS. + Returns {} if nothing is found. 
+ ''' + + # Get the CPU arch and bits + arch, bits = parse_arch(DataSource.raw_arch_string) + + friendly_maxsize = { 2**31-1: '32 bit', 2**63-1: '64 bit' }.get(sys.maxsize) or 'unknown bits' + friendly_version = "{0}.{1}.{2}.{3}.{4}".format(*sys.version_info) + PYTHON_VERSION = "{0} ({1})".format(friendly_version, friendly_maxsize) + + info = { + 'python_version' : PYTHON_VERSION, + 'cpuinfo_version' : CPUINFO_VERSION, + 'arch' : arch, + 'bits' : bits, + 'count' : DataSource.cpu_count, + 'raw_arch_string' : DataSource.raw_arch_string, + } + + # Try the Windows wmic + CopyNewFields(info, _get_cpu_info_from_wmic()) + + # Try the Windows registry + CopyNewFields(info, _get_cpu_info_from_registry()) + + # Try /proc/cpuinfo + CopyNewFields(info, _get_cpu_info_from_proc_cpuinfo()) + + # Try cpufreq-info + CopyNewFields(info, _get_cpu_info_from_cpufreq_info()) + + # Try LSCPU + CopyNewFields(info, _get_cpu_info_from_lscpu()) + + # Try sysctl + CopyNewFields(info, _get_cpu_info_from_sysctl()) + + # Try kstat + CopyNewFields(info, _get_cpu_info_from_kstat()) + + # Try dmesg + CopyNewFields(info, _get_cpu_info_from_dmesg()) + + # Try /var/run/dmesg.boot + CopyNewFields(info, _get_cpu_info_from_cat_var_run_dmesg_boot()) + + # Try lsprop ibm,pa-features + CopyNewFields(info, _get_cpu_info_from_ibm_pa_features()) + + # Try sysinfo + CopyNewFields(info, _get_cpu_info_from_sysinfo()) + + # Try querying the CPU cpuid register + CopyNewFields(info, _get_cpu_info_from_cpuid()) + + return info + +# Make sure we are running on a supported system +def _check_arch(): + arch, bits = parse_arch(DataSource.raw_arch_string) + if not arch in ['X86_32', 'X86_64', 'ARM_7', 'ARM_8', 'PPC_64']: + raise Exception("py-cpuinfo currently only works on X86 and some PPC and ARM CPUs.") + +def main(): + try: + _check_arch() + except Exception as err: + sys.stderr.write(str(err) + "\n") + sys.exit(1) + + info = get_cpu_info() + if info: + print('Python Version: {0}'.format(info.get('python_version', ''))) + print('Cpuinfo Version: {0}'.format(info.get('cpuinfo_version', ''))) + print('Vendor ID: {0}'.format(info.get('vendor_id', ''))) + print('Hardware Raw: {0}'.format(info.get('hardware', ''))) + print('Brand: {0}'.format(info.get('brand', ''))) + print('Hz Advertised: {0}'.format(info.get('hz_advertised', ''))) + print('Hz Actual: {0}'.format(info.get('hz_actual', ''))) + print('Hz Advertised Raw: {0}'.format(info.get('hz_advertised_raw', ''))) + print('Hz Actual Raw: {0}'.format(info.get('hz_actual_raw', ''))) + print('Arch: {0}'.format(info.get('arch', ''))) + print('Bits: {0}'.format(info.get('bits', ''))) + print('Count: {0}'.format(info.get('count', ''))) + + print('Raw Arch String: {0}'.format(info.get('raw_arch_string', ''))) + + print('L1 Data Cache Size: {0}'.format(info.get('l1_data_cache_size', ''))) + print('L1 Instruction Cache Size: {0}'.format(info.get('l1_instruction_cache_size', ''))) + print('L2 Cache Size: {0}'.format(info.get('l2_cache_size', ''))) + print('L2 Cache Line Size: {0}'.format(info.get('l2_cache_line_size', ''))) + print('L2 Cache Associativity: {0}'.format(info.get('l2_cache_associativity', ''))) + print('L3 Cache Size: {0}'.format(info.get('l3_cache_size', ''))) + print('Stepping: {0}'.format(info.get('stepping', ''))) + print('Model: {0}'.format(info.get('model', ''))) + print('Family: {0}'.format(info.get('family', ''))) + print('Processor Type: {0}'.format(info.get('processor_type', ''))) + print('Extended Model: {0}'.format(info.get('extended_model', ''))) + print('Extended Family: 
{0}'.format(info.get('extended_family', ''))) + print('Flags: {0}'.format(', '.join(info.get('flags', '')))) + else: + sys.stderr.write("Failed to find cpu info\n") + sys.exit(1) + + +if __name__ == '__main__': + from multiprocessing import freeze_support + freeze_support() + main() +else: + _check_arch() diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..3688c63 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = PyTables +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..ae3ffd4 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=PyTables + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/man/pt2to3.1 b/doc/man/pt2to3.1 new file mode 100644 index 0000000..68c88e9 --- /dev/null +++ b/doc/man/pt2to3.1 @@ -0,0 +1,55 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH PT2TO3 1 "May 15, 2013" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +pt2to3 \- PyTables 2.x \-> 3.x API transition tool. +.SH SYNOPSIS +.B pt2to3 +[ OPTIONS ] +.RB filename +.br +.SH DESCRIPTION +This tool displays to standard out, so it is common to pipe this to another +file: $ pt2to3 oldfile.py > newfile.py. + +.SH OPTIONS +A summary of options is included below. +.TP +.B \-r, \--reverse +Reverts changes, going from 3.x \-> 2.x. +.TP +.B \-p, \--no-ignore-previous +Ignores previous_api() calls. +.TP +.B \-o OUTPUT +Output file to write to.. +.TP +.B \-i, \--inplace +Overwrites the file in-place. +.TP +.B \-h +Print help on usage. + +.br + +.SH SEE ALSO +.BR ptrepack (1), ptdump (1). +.br +These utilities are documented fully by +.IR "PyTables user's manual". +.SH AUTHOR +This manual page was written by Antonio Valentino . 
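As a rough illustration of the kind of rewrite the pt2to3 man page above describes, the sketch below renames a few well-known PyTables 2.x camelCase calls (openFile, createTable, getNode) to their 3.x snake_case equivalents. This is only a hedged approximation for the reader: the three-entry mapping and the helper name rename_api_calls are illustrative choices made here, not the tool's actual rename table or implementation.

    # Illustrative sketch only -- NOT the real pt2to3 implementation.
    # The three renames below are known PyTables 2.x -> 3.x API changes;
    # the real tool ships a much larger mapping and extra options.
    import re

    OLD_TO_NEW = {
        'openFile': 'open_file',
        'createTable': 'create_table',
        'getNode': 'get_node',
    }

    def rename_api_calls(source):
        """Replace old-style names with their 3.x equivalents, word by word."""
        pattern = re.compile(r'\b(' + '|'.join(map(re.escape, OLD_TO_NEW)) + r')\b')
        return pattern.sub(lambda m: OLD_TO_NEW[m.group(1)], source)

    if __name__ == '__main__':
        old = "h5f = tables.openFile('data.h5', 'w'); t = h5f.createTable('/', 't', Desc)"
        print(rename_api_calls(old))

Run on a 2.x-style script, such a filter prints the 3.x spelling to standard output, which mirrors the "pt2to3 oldfile.py > newfile.py" usage shown in the man page.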
diff --git a/doc/man/ptdump.1 b/doc/man/ptdump.1 new file mode 100644 index 0000000..010aed0 --- /dev/null +++ b/doc/man/ptdump.1 @@ -0,0 +1,63 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH PTDUMP 1 "July 7, 2007" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +ptdump \- Lets you dig into the contents of a \fIPyTables\fR file. +.SH SYNOPSIS +.B ptdump +.RB [\| \-dvacih \|] +.RB [\| \-R \| \ start,stop,step] +.RB file[:nodepath] +.br +.SH DESCRIPTION +Allows you look into the contents of your \fIPyTables\fR files. It lets +you see not only the data but also the metadata (that is, the structure +and additional information in the form of attributes). + +.SH OPTIONS +A summary of options is included below. +.TP +.B \-d +Dump data information on leaves. +.TP +.B \-v +Dump more metainformation on nodes. +.TP +.B \-a +Show attributes in nodes (only useful when \-v or \-d are active). +.TP +.B \-c +Show info of columns in tables (only useful when \-v or \-d are active). +.TP +.B \-i +Show info of indexed column (only useful when \-v or \-d are active). +.TP +.BI \-R\ start,stop,step +Select a range of rows in the form "start,stop,step" for \fIall\fR leaves. +.TP +.B \-h +Print help on usage. + +.br + +.SH SEE ALSO +.BR ptrepack (1). +.br +These utilities are documented fully by +.IR "PyTables user's manual". +.SH AUTHOR +This manual page was written by Francesc Altet . diff --git a/doc/man/ptrepack.1 b/doc/man/ptrepack.1 new file mode 100644 index 0000000..4eebf98 --- /dev/null +++ b/doc/man/ptrepack.1 @@ -0,0 +1,107 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH PTREPACK 1 "July 7, 2007" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +ptrepack \- Copy any PyTables Leaf, Group or complete subtree into another file. +.SH SYNOPSIS +.B ptrepack +.RB \-h +.RB \-v +.RB \-o +.RB \-R +.IR start,stop,step +.RB \-\-non\-recursive +.RB \-\-dest-title=title +.RB \-\-dont\-copyuser\-attrs +.RB \-\-overwrite\-nodes +.RB \-\-complevel=(0\-9) +.RB \-\-complib=lib +.RB \-\-shuffle=(0|1) +.RB \-\-fletcher32=(0|1) +.RB \-\-keep\-source\-filters +.RB \-\-upgrade\-flavors +.RB \-\-dont\-regenerate\-old\-indexes +.RB sourcefile:sourcegroup +.RB destfile:destgroup +.br +.SH DESCRIPTION +Copy any Leaf, Group or complete subtree from a PyTables file into another file. +.SH OPTIONS +A summary of options is included below. +.TP +.B \-h +Prints a help text. +.TP +.B \-v +Show more information. +.TP +.B \-o +Overwrite destination file. 
+.TP +.BI \-R\ RANGE +Select a RANGE of rows in the form \fIstart,stop,step\fR during the copy of +\fIall\fR the leaves. +.TP +.BI \-\-non\-recursive +Do not do a recursive copy. Default is to do it. +.TP +.BI \-\-dest\-title=title +Title for the new file (if not specified, the source is copied). +.TP +.BI \-\-dont\-copy\-userattrs +Do not copy the user attrs (default is to do it). +.TP +.BI \-\-overwrite\-nodes +Overwrite destination nodes if they exist. Default is not to overwrite +them. +.TP +.BI \-\-complevel=(0-9) +Set a compression level (0 for no compression, which is the default). +.TP +.BI \-\-complib=lib +Set the compression library to be used during the copy. \fIlib\fR can +be set to "zlib", "lzo", "ucl" or "bzip2". Defaults to "zlib". +.TP +.BI \-\-shuffle=(0|1) +Activate or not the shuffling filter (default is active if complevel>0). +.TP +.BI \-\-fletcher32=(0|1) +Whether to activate or not the fletcher32 filter (not active by default). +.TP +.BI \-\-keep\-source\-filters +Use the original filters in source files. The default is not doing that +if any of \-\-complevel, \-\-complib, \-\-shuffle or \-\-fletcher32 +option is specified. +.TP +.BI \-\-upgrade\-flavors +When repacking PyTables 1.x files, the flavor of leaves will be +unset. With this, such a leaves will be serialized as objects with the +internal flavor ("numpy" for 2.x series). +.TP +.BI \-\-dont\-regenerate\-old\-indexes +Disable regenerating old indexes. The default is to regenerate old +indexes as they are found. + +.br + +.SH SEE ALSO +.BR ptdump (1). +.br +These utilities are documented fully by +.IR "PyTables user's manual". +.SH AUTHOR +This manual page was written by Francesc Altet . diff --git a/doc/man/pttree.1 b/doc/man/pttree.1 new file mode 100644 index 0000000..04bdb6d --- /dev/null +++ b/doc/man/pttree.1 @@ -0,0 +1,79 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH pttree 1 "May 15, 2013" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +pttree \- prints a quick overview of the contents of PyTables HDF5 files. +.SH SYNOPSIS +.B pttree +[ OPTIONS ] +.RB filename[:nodepath] +.br +.SH DESCRIPTION +.B pttree +is designed to give a quick overview of the contents of a PyTables +HDF5 file by printing a depth-indented list of nodes, similar to the output of +the Unix `tree` function. It can also display the size, shape and compression +states of individual nodes, as well as summary information for the whole file. +For a more verbose output (including metadata), see `ptdump`. + +.SH OPTIONS +A summary of options is included below. 
+.TP +.B \-L MAX_DEPTH, \--max-level MAX_DEPTH +maximum branch depth of tree to display (-1 == no limit) +.TP +.B \-S SORT_BY, \--sort-by SORT_BY +artificially order nodes, can be either "size", "name" or "none" +.TP +.B \--print-size +print size of each node/branch +.TP +.B \--no-print-size +.TP +.B \--print-shape +print shape of each node +.TP +.B \--no-print-shape +.TP +.B \--print-compression +print compression library(level) for each compressed node +.TP +.B \--no-print-compression +.TP +.B \--print-percent +print size of each node as a % of the total tree size on disk +.TP +.B \--no-print-percent +.TP +.B \--use-si-units +report sizes in SI units (1 MB == 10^6 B) +.TP +.B \--use-binary-units +report sizes in binary units (1 MiB == 2^20 B) +.TP +.B \-h +Print help on usage. + +.br + +.SH SEE ALSO +.BR ptdump (1). +.br +These utilities are documented fully by +.IR "PyTables user's manual". +.SH AUTHOR +This manual page was written by Antonio Valentino . diff --git a/doc/scripts/filenode.py b/doc/scripts/filenode.py new file mode 100644 index 0000000..bad21df --- /dev/null +++ b/doc/scripts/filenode.py @@ -0,0 +1,45 @@ +# Copy this file into the clipboard and paste into 'script -c python'. + +import tables as tb + +h5file = tb.open_file('fnode.h5', 'w') + +fnode = tb.nodes.FileNode.new_node(h5file, where='/', name='fnode_test') + +print(h5file.getAttrNode('/fnode_test', 'NODE_TYPE')) + +print("This is a test text line.", file=fnode) +print("And this is another one.", file=fnode) +print(file=fnode) +fnode.write("Of course, file methods can also be used.") + +fnode.seek(0) # Go back to the beginning of file. + +for line in fnode: + print(repr(line)) + +fnode.close() +print(fnode.closed) + +node = h5file.root.fnode_test +fnode = tb.nodes.FileNode.open_node(node, 'a+') +print(repr(fnode.readline())) +print(fnode.tell()) +print("This is a new line.", file=fnode) +print(repr(fnode.readline())) + +fnode.seek(0) +for line in fnode: + print(repr(line)) + +fnode.attrs.content_type = 'text/plain; charset=us-ascii' + +fnode.attrs.author = "Ivan Vilata i Balaguer" +fnode.attrs.creation_date = '2004-10-20T13:25:25+0200' +fnode.attrs.keywords_en = ["FileNode", "test", "metadata"] +fnode.attrs.keywords_ca = ["FileNode", "prova", "metadades"] +fnode.attrs.owner = 'ivan' +fnode.attrs.acl = {'ivan': 'rw', '@users': 'r'} + +fnode.close() +h5file.close() diff --git a/doc/scripts/pickletrouble.py b/doc/scripts/pickletrouble.py new file mode 100644 index 0000000..710a705 --- /dev/null +++ b/doc/scripts/pickletrouble.py @@ -0,0 +1,27 @@ +import tables as tb + + +class MyClass: + foo = 'bar' + + +# An object of my custom class. +myObject = MyClass() + +with tb.open_file('test.h5', 'w') as h5f: + h5f.root._v_attrs.obj = myObject # store the object + print(h5f.root._v_attrs.obj.foo) # retrieve it + +# Delete class of stored object and reopen the file. +del MyClass, myObject + +with tb.open_file('test.h5', 'r') as h5f: + print(h5f.root._v_attrs.obj.foo) + # Let us inspect the object to see what is happening. + print(repr(h5f.root._v_attrs.obj)) + # Maybe unpickling the string will yield more information: + import pickle + + pickle.loads(h5f.root._v_attrs.obj) + # So the problem was not in the stored object, + # but in the *environment* where it was restored. diff --git a/doc/scripts/tutorial1.py b/doc/scripts/tutorial1.py new file mode 100644 index 0000000..66ec58e --- /dev/null +++ b/doc/scripts/tutorial1.py @@ -0,0 +1,302 @@ +"""Small but quite comprehensive example showing the use of PyTables. 
+ +The program creates an output file, 'tutorial1.h5'. You can view it +with any HDF5 generic utility. + +""" + +import os +import sys +import traceback + +SECTION = "I HAVE NO TITLE" + + +def tutsep(): + print('----8<----', SECTION, '----8<----') + + +def tutprint(obj): + tutsep() + print(obj) + + +def tutrepr(obj): + tutsep() + print(repr(obj)) + + +def tutexc(): + tutsep() + traceback.print_exc(file=sys.stdout) + + +SECTION = "Importing tables objects" +import numpy as np +import tables as tb + +SECTION = "Declaring a Column Descriptor" + + +# Define a user record to characterize some kind of particles +class Particle(tb.IsDescription): + name = tb.StringCol(16) # 16-character String + idnumber = tb.Int64Col() # Signed 64-bit integer + ADCcount = tb.UInt16Col() # Unsigned short integer + TDCcount = tb.UInt8Col() # unsigned byte + grid_i = tb.Int32Col() # integer + grid_j = tb.IntCol() # integer (equivalent to Int32Col) + pressure = tb.Float32Col() # float (single-precision) + energy = tb.FloatCol() # double (double-precision) + + +SECTION = "Creating a PyTables file from scratch" +# Open a file in "w"rite mode +h5file = tb.open_file('tutorial1.h5', mode="w", title="Test file") + +SECTION = "Creating a new group" +# Create a new group under "/" (root) +group = h5file.create_group("/", 'detector', 'Detector information') + +SECTION = "Creating a new table" +# Create one table on it +table = h5file.create_table(group, 'readout', Particle, "Readout example") + +tutprint(h5file) +tutrepr(h5file) + +# Get a shortcut to the record object in table +particle = table.row + +# Fill the table with 10 particles +for i in range(10): + particle['name'] = f'Particle: {i:6d}' + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = float(i * i) + particle['energy'] = float(particle['pressure'] ** 4) + particle['idnumber'] = i * (2 ** 34) + # Insert a new particle record + particle.append() + +# Flush the buffers for table +table.flush() + +SECTION = "Reading (and selecting) data in a table" +# Read actual data from table. We are interested in collecting pressure values +# on entries where TDCcount field is greater than 3 and pressure less than 50 +table = h5file.root.detector.readout +pressure = [ + x['pressure'] + for x in table + if x['TDCcount'] > 3 and 20 <= x['pressure'] < 50 +] + +tutrepr(pressure) + +# Read also the names with the same cuts +names = [ + x['name'] + for x in table + if x['TDCcount'] > 3 and 20 <= x['pressure'] < 50 +] + +tutrepr(names) + +SECTION = "Creating new array objects" +gcolumns = h5file.create_group(h5file.root, "columns", "Pressure and Name") + +tutrepr( + h5file.create_array(gcolumns, 'pressure', np.array(pressure), + "Pressure column selection") +) + +tutrepr( + h5file.create_array('/columns', 'name', names, "Name column selection") +) + +tutprint(h5file) + +SECTION = "Closing the file and looking at its content" +# Close the file +h5file.close() + +tutsep() +os.system('h5ls -rd tutorial1.h5') +tutsep() +os.system('ptdump tutorial1.h5') + +"""This example shows how to browse the object tree and enlarge tables. + +Before to run this program you need to execute first tutorial1-1.py +that create the tutorial1.h5 file needed here. 
+ +""" + +SECTION = "Traversing the object tree" +# Reopen the file in append mode +h5file = tb.open_file("tutorial1.h5", "a") + +# Print the object tree created from this filename +# List all the nodes (Group and Leaf objects) on tree +tutprint(h5file) + +# List all the nodes (using File iterator) on tree +tutsep() +for node in h5file: + print(node) + +# Now, only list all the groups on tree +tutsep() +for group in h5file.walk_groups("/"): + print(group) + +# List only the arrays hanging from / +tutsep() +for group in h5file.walk_groups("/"): + for array in h5file.list_nodes(group, classname='Array'): + print(array) + +# This gives the same result +tutsep() +for array in h5file.walk_nodes("/", "Array"): + print(array) + +# And finally, list only leafs on /detector group (there should be one!) +# Other way using iterators and natural naming +tutsep() +for leaf in h5file.root.detector('Leaf'): + print(leaf) + +SECTION = "Setting and getting user attributes" +# Get a pointer to '/detector/readout' +table = h5file.root.detector.readout + +# Attach it a string (date) attribute +table.attrs.gath_date = "Wed, 06/12/2003 18:33" + +# Attach a floating point attribute +table.attrs.temperature = 18.4 +table.attrs.temp_scale = "Celsius" + +# Get a pointer to '/detector' +detector = h5file.root.detector +# Attach a general object to the parent (/detector) group +detector._v_attrs.stuff = [5, (2.3, 4.5), "Integer and tuple"] + +# Now, get the attributes +tutrepr(table.attrs.gath_date) +tutrepr(table.attrs.temperature) +tutrepr(table.attrs.temp_scale) +tutrepr(detector._v_attrs.stuff) + +# Delete permanently the attribute gath_date of /detector/readout +del table.attrs.gath_date + +# Print a representation of all attributes in /detector/table +tutrepr(table.attrs) + +# Get the (user) attributes of /detector/table +tutprint(table.attrs._f_list("user")) + +# Get the (sys) attributes of /detector/table +tutprint(table.attrs._f_list("sys")) + +# Rename an attribute +table.attrs._f_rename("temp_scale", "tempScale") +tutprint(table.attrs._f_list()) + +# Try to rename a system attribute: +try: + table.attrs._f_rename("VERSION", "version") +except: + tutexc() + +h5file.flush() +tutsep() +os.system('h5ls -vr tutorial1.h5/detector/readout') + +SECTION = "Getting object metadata" +# Get metadata from table +tutsep() +print("Object:", table) +tutsep() +print("Table name:", table.name) +tutsep() +print("Table title:", table.title) +tutsep() +print("Number of rows in table:", table.nrows) +tutsep() +print("Table variable names with their type and shape:") +tutsep() +for name in table.colnames: + print(f'{name}:= {table.coltypes[name]}, {table.colshapes[name]}') + +tutprint(table.__doc__) + +# Get the object in "/columns pressure" +pressureObject = h5file.get_node("/columns", "pressure") + +# Get some metadata on this object +tutsep() +print(f"Info on the object: {pressureObject!r}") +tutsep() +print(f" shape: ==> {pressureObject.shape}") +tutsep() +print(f" title: ==> {pressureObject.title}") +tutsep() +print(f" type: ==> {pressureObject.type}") + +SECTION = "Reading data from Array objects" +# Read the 'pressure' actual data +pressureArray = pressureObject.read() +tutrepr(pressureArray) +tutsep() +print(f"pressureArray is an object of type: {type(pressureArray)}") + +# Read the 'name' Array actual data +nameArray = h5file.root.columns.name.read() +tutrepr(nameArray) +print(f"nameArray is an object of type: {type(nameArray)}") + +# Print the data for both arrays +tutprint("Data on arrays nameArray and 
pressureArray:") +tutsep() +for i in range(pressureObject.shape[0]): + print(f"{nameArray[i]} --> {pressureArray[i]}") +tutrepr(pressureObject.name) + +SECTION = "Appending data to an existing table" +# Create a shortcut to table object +table = h5file.root.detector.readout +# Get the object row from table +particle = table.row + +# Append 5 new particles to table +for i in range(10, 15): + particle['name'] = f'Particle: {i:6d}' + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = float(i * i) + particle['energy'] = float(particle['pressure'] ** 4) + particle['idnumber'] = i * (2 ** 34) # This exceeds long integer range + particle.append() + +# Flush this table +table.flush() + +# Print the data using the table iterator: +tutsep() +for r in table: + print(f"{r['name']:<16s} | {r['pressure']:11.1f} | {r['energy']:11.4g} | " + f"{r['grid_i']:6d} | {r['grid_j']:6d} | {r['TDCcount']:8d} |") + +# Delete some rows on the Table (yes, rows can be removed!) +tutrepr(table.remove_rows(5, 10)) + +# Close the file +h5file.close() diff --git a/doc/source/FAQ.rst b/doc/source/FAQ.rst new file mode 100644 index 0000000..e204f6f --- /dev/null +++ b/doc/source/FAQ.rst @@ -0,0 +1,520 @@ +:author: FrancescAlted +:date: 2011-06-13 08:40:20 + +.. py:currentmodule:: tables + +=== +FAQ +=== + +General questions +================= + +What is PyTables? +----------------- + +PyTables is a package for managing hierarchical datasets designed to +efficiently cope with extremely large amounts of data. + +It is built on top of the HDF5_ library, the `Python language`_ and the +NumPy_ package. +It features an object-oriented interface that, combined with C extensions +for the performance-critical parts of the code, makes it a fast yet +extremely easy-to-use tool for interactively storing and retrieving very +large amounts of data. + + +What are PyTables' licensing terms? +----------------------------------- + +PyTables is free for both commercial and non-commercial use, under the terms +of the `BSD 3-Clause License `_. + + +I'm having problems. How can I get support? +------------------------------------------- + +The most common and efficient way is to subscribe (remember you *need* to +subscribe prior to send messages) to the PyTables `users mailing list`_, and +send there a brief description of your issue and, if possible, a short script +that can reproduce it. +Hopefully, someone on the list will be able to help you. +It is also a good idea to check out the `archives of the user's list`_ (you may +want to check the `Gmane archives`_ instead) so as to see if the answer to your +question has already been dealt with. + + +Why HDF5? +--------- + +HDF5_ is the underlying C library and file format that enables PyTables to +efficiently deal with the data. It has been chosen for the following reasons: + +* Designed to efficiently manage very large datasets. +* Lets you organize datasets hierarchically. +* Very flexible and well tested in scientific environments. +* Good maintenance and improvement rate. +* Technical excellence (`R&D 100 Award`_). +* **It's Open Source software** + + +Why Python? +----------- + +1. Python is interactive. + + People familiar with data processing understand how powerful command line + interfaces are for exploring mathematical relationships and scientific data + sets. Python provides an interactive environment with the added benefit of + a full featured programming language behind it. + +2. 
Python is productive for beginners and experts alike.
+
+   PyTables is targeted at engineers, scientists, system analysts, financial
+   analysts, and others who consider programming a necessary evil. Any time
+   spent learning a language or tracking down bugs is time spent not solving
+   their real problem. Python has a short learning curve and most people can
+   do real and useful work with it in a day of learning. Its clean syntax and
+   interactive nature facilitate this.
+
+3. Python is data-handling friendly.
+
+   Python comes with nice idioms that make access to data much easier:
+   general slicing (i.e. ``data[start:stop:step]``), list comprehensions,
+   iterators, generators ... are constructs that make the interaction with
+   your data very easy.
+
+
+Why NumPy?
+----------
+
+NumPy_ is a Python package to efficiently deal with large datasets
+**in-memory**, providing containers for homogeneous data, heterogeneous data,
+and string arrays.
+PyTables uses these NumPy containers as *in-memory buffers* to push the I/O
+bandwidth towards the platform limits.
+
+
+Where can PyTables be applied?
+==============================
+
+In all the scenarios where one needs to deal with large datasets:
+
+* Industrial applications
+
+  - Data acquisition in real time
+  - Quality control
+  - Fast data processing
+
+* Scientific applications
+
+  - Meteorology, oceanography
+  - Numerical simulations
+  - Medicine (biological sensors, general data gathering & processing)
+
+* Information systems
+
+  - System log monitoring & consolidation
+  - Tracing of routing data
+  - Alert systems in security
+
+
+Is PyTables safe?
+-----------------
+
+Well, first of all, let me state that PyTables does not support transactional
+features yet (we don't even know if we will ever be motivated to implement
+this!), so there is always the risk that you can lose your data in case of an
+unexpected event while writing (like a power outage, system shutdowns ...).
+Having said that, if your typical scenarios are *write once, read many*, then
+the use of PyTables is perfectly safe, even when dealing with extremely large
+amounts of data.
+
+
+Can PyTables be used in concurrent access scenarios?
+----------------------------------------------------
+
+It depends. Concurrent reads are no problem at all. However, whenever a process
+(or thread) is trying to write, then problems will start to appear. First,
+PyTables doesn't support locking at any level, so several processes writing
+concurrently to the same PyTables file will probably end up corrupting it, so
+don't do this! Even having only one process writing and the others reading is
+a hairy thing, because the reading processes might be reading incomplete data
+from a concurrent data writing operation.
+
+The solution would be to lock the file while writing and unlock it after a
+flush over the file has been performed. Also, in order to avoid cache (HDF5_,
+PyTables) problems with reading apps, you would need to re-open your files
+whenever you are going to issue a read operation. If a re-opening operation is
+unacceptable in terms of speed, you may want to do all your I/O operations in
+one single process (or thread) and communicate the results via sockets,
+:class:`Queue.Queue` objects (when using threads), or whatever, with the
+client process/thread.
+
+The `examples` directory contains two scripts demonstrating methods of
+accessing a PyTables file from multiple processes.
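+
+Before describing those scripts, here is a minimal sketch of the single-writer
+pattern outlined above (lock around the write-plus-flush, re-open before every
+read). It is only an illustration: the helper names, the ``/detector/readout``
+layout and the ``pressure`` column are assumptions borrowed from the tutorial,
+not part of the PyTables API::
+
+    import multiprocessing as mp
+    import tables
+
+    lock = mp.Lock()  # shared by the writer and the reader processes
+
+    def append_rows(filename, rows):
+        # Writer: take the lock, append, flush, then release the lock.
+        with lock:
+            with tables.open_file(filename, 'a') as h5f:
+                h5f.root.detector.readout.append(rows)
+                h5f.flush()
+
+    def read_pressures(filename):
+        # Reader: re-open the file for every read to avoid stale caches.
+        with tables.open_file(filename, 'r') as h5f:
+            return h5f.root.detector.readout.col('pressure')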
+ +The first, *multiprocess_access_queues.py*, uses a +:class:`multiprocessing.Queue` object to transfer read and write requests from +multiple *DataProcessor* processes to a single process responsible for all +access to the PyTables file. The results of read requests are then transferred +back to the originating processes using other :class:`Queue` objects. + +The second example script, *multiprocess_access_benchmarks.py*, demonstrates +and benchmarks four methods of transferring PyTables array data between +processes. The four methods are: + + * Using :class:`multiprocessing.Pipe` from the Python standard library. + * Using a memory mapped file that is shared between two processes. The NumPy + array associated with the file is passed as the *out* argument to the + :meth:`tables.Array.read` method. + * Using a Unix domain socket. Note that this example uses the 'abstract + namespace' and will only work under Linux. + * Using an IPv4 socket. + +See also the discussion in :issue:`790`. + + +What kind of containers does PyTables implement? +------------------------------------------------ + +PyTables does support a series of data containers that address specific needs +of the user. Below is a brief description of them: + +::class:`Table`: + Lets you deal with heterogeneous datasets. Allows compression. Enlargeable. + Supports nested types. Good performance for read/writing data. +::class:`Array`: + Provides quick and dirty array handling. Not compression allowed. + Not enlargeable. Can be used only with relatively small datasets (i.e. + those that fit in memory). It provides the fastest I/O speed. +::class:`CArray`: + Provides compressed array support. Not enlargeable. Good speed when + reading/writing. +::class:`EArray`: + Most general array support. Compressible and enlargeable. It is pretty + fast at extending, and very good at reading. +::class:`VLArray`: + Supports collections of homogeneous data with a variable number of entries. + Compressible and enlargeable. I/O is not very fast. +::class:`Group`: + The structural component. + A hierarchically-addressable container for HDF5 nodes (each of these + containers, including Group, are nodes), similar to a directory in a + UNIX filesystem. + +Please refer to the :doc:`usersguide/libref` for more specific information. + + +Cool! I'd like to see some examples of use. +------------------------------------------- + +Sure. Go to the HowToUse section to find simple examples that will help you +getting started. + + +Can you show me some screenshots? +--------------------------------- + +Well, PyTables is not a graphical library by itself. However, you may want to +check out ViTables_, a GUI tool to browse and edit PyTables & HDF5_ files. + + +Is PyTables a replacement for a relational database? +---------------------------------------------------- + +No, by no means. PyTables lacks many features that are standard in most +relational databases. In particular, it does not have support for +relationships (beyond the hierarchical one, of course) between datasets and it +does not have transactional features. PyTables is more focused on speed and +dealing with really large datasets, than implementing the above features. In +that sense, PyTables can be best viewed as a *teammate* of a relational +database. + +For example, if you have very large tables in your existing relational +database, they will take lots of space on disk, potentially reducing the +performance of the relational engine. 
In such a case, you can move those huge
+tables out of your existing relational database to PyTables, and let your
+relational engine do what it does best (i.e. manage relatively small or medium
+datasets with potentially complex relationships), and use PyTables for what it
+has been designed for (i.e. manage large amounts of data which are loosely
+related).
+
+
+How can PyTables be fast if it is written in an interpreted language like Python?
+----------------------------------------------------------------------------------
+
+Actually, all of the critical I/O code in PyTables is a thin layer of code on
+top of HDF5_, which is a very efficient C library. Cython_ is used as the
+*glue* language to generate "wrappers" around HDF5 calls so that they can be
+used in Python. Also, the use of an efficient numerical package such as NumPy_
+makes the most costly operations effectively run at C speed. Finally,
+time-critical loops are usually implemented in Cython_ (which, if used
+properly, allows generating code that runs at almost pure C speed).
+
+
+If it is designed to deal with very large datasets, then PyTables should consume a lot of memory, shouldn't it?
+----------------------------------------------------------------------------------------------------------------
+
+Well, you already know that PyTables sits on top of HDF5, Python and NumPy_,
+and if we add its own logic (~7500 lines of code in Python, ~3000 in Cython and
+~4000 in C), then we should conclude that PyTables isn't exactly a paradigm of
+lightness.
+
+Having said that, PyTables (as HDF5_ itself) tries very hard to optimize
+memory consumption by implementing a series of features like dynamic
+determination of buffer sizes, a *Least Recently Used* cache for keeping
+unused nodes out of memory, and extensive use of compact NumPy_ data
+containers. Moreover, PyTables is in a relatively mature state and most memory
+leaks have already been addressed and fixed.
+
+Just to give you an idea of what you can expect, a PyTables program can deal
+with a table with around 30 columns and 1 million entries using as little as
+13 MB of memory (on a 32-bit platform). All in all, that is not much, is it?
+
+
+Why was PyTables born?
+----------------------
+
+Because, back in August 2002, one of its authors (`Francesc Alted`_) needed to
+save lots of hierarchical data in an efficient way for later post-processing.
+After trying out several approaches, he found that each of them presented its
+own inconveniences. For example, working with file sizes larger than, say,
+100 MB, was rather painful with ZODB (it took lots of memory with the version
+available at that time).
+
+The netCDF3_ interface provided by `Scientific Python`_ was great, but it did
+not allow structuring the data hierarchically; besides, netCDF3_ only supports
+homogeneous datasets, not heterogeneous ones (i.e. tables). (As an aside,
+netCDF4_ overcomes many of the limitations of netCDF3_, although curiously
+enough, it is built on top of HDF5_, the library chosen as the base for
+PyTables from the very beginning.)
+
+So, he decided to give HDF5_ a try, started writing his own wrappers for it
+and voilà, this is how the first public release of PyTables (0.1) saw the
+light in October 2002, three months after his itch started to eat him ;-).
+
+
+How does PyTables compare with the h5py project?
+------------------------------------------------
+
+Well, they are similar in that both packages are Python interfaces to the HDF5_
+library, but there are some important differences to be noted. h5py_ is an
+attempt to map the HDF5_ feature set to NumPy_ as closely as possible. In
+addition, it also provides access to nearly all of the HDF5_ C API.
+
+PyTables, instead, builds an additional abstraction layer on top of HDF5_ and
+NumPy_ where it implements things like an enhanced type system, an :ref:`engine
+for enabling complex queries `, an efficient computational
+kernel, advanced indexing capabilities and an undo/redo feature, to name
+just a few. This additional layer also allows PyTables to be relatively
+independent of its underlying libraries (and their possible limitations). For
+example, PyTables can support HDF5_ data types like `enumerated` or `time` that
+are available in the HDF5_ library but not in the NumPy_ package; or even
+perform powerful complex queries that are not implemented directly in either
+HDF5_ or NumPy_.
+
+Furthermore, PyTables also tries hard to be a high-performance interface to
+HDF5/NumPy, implementing niceties like internal LRU caches for nodes and other
+data and metadata, :ref:`automatic computation of optimal chunk sizes
+` for the datasets, and a variety of compressors, ranging from
+slow but efficient (bzip2_) to extremely fast ones (Blosc_) in addition to the
+standard `zlib`_. Another difference is that PyTables makes use of numexpr_ so
+as to accelerate internal computations (for example, in evaluating complex
+queries) to a maximum.
+
+For a contrasting point of view, you may want to check the PyTables/h5py
+comparison in a similar entry of the `FAQ of h5py`_.
+
+
+I've found a bug. What do I do?
+--------------------------------
+
+The PyTables development team works hard to make this eventuality as rare as
+possible, but, as in any software made by human beings, bugs do occur. If you
+find any bug, please tell us by filing a bug report in the `issue tracker`_ on
+GitHub_.
+
+
+Is it possible to get involved in PyTables development?
+--------------------------------------------------------
+
+Indeed. We are keen for more people to help out by contributing code, unit
+tests and documentation, and by helping to maintain this wiki. Drop us a mail
+on the `users mailing list`_ and tell us in which area you want to work.
+
+
+How can I cite PyTables?
+------------------------
+
+The recommended way to cite PyTables in a paper or a presentation is as
+follows:
+
+* Author: Francesc Alted, Ivan Vilata and others
+* Title: PyTables: Hierarchical Datasets in Python
+* Year: 2002 -
+* URL: http://www.pytables.org
+
+Here's an example of a BibTeX entry::
+
+    @Misc{,
+      author = {PyTables Developers Team},
+      title  = {{PyTables}: Hierarchical Datasets in {Python}},
+      year   = {2002--},
+      url    = "http://www.pytables.org/"
+    }
+
+
+PyTables 2.x issues
+===================
+
+I'm having problems migrating my apps from PyTables 1.x into PyTables 2.x. Please, help!
+------------------------------------------------------------------------------------------
+
+Sure. However, you should first check out the :doc:`MIGRATING_TO_2.x`
+document.
+It should provide hints on the most frequently asked questions in this regard.
+
+
+For combined searches like `table.where('(x<5) & (x>3)')`, why was a `&` operator chosen instead of an `and`?
+------------------------------------------------------------------------------------------------------------- + +Search expressions are in fact Python expressions written as strings, and they +are evaluated as such. This has the advantage of not having to learn a new +syntax, but it also implies some limitations with logical `and` and `or` +operators, namely that they can not be overloaded in Python. Thus, it is +impossible right now to get an element-wise operation out of an expression like +`'array1 and array2'`. That's why one has to choose some other operator, being +`&` and `|` the most similar to their C counterparts `&&` and `||`, which +aren't available in Python either. + +You should be careful about expressions like `'x<5 & x>3'` and others like `'3 +< x < 5'` which ''won't work as expected'', because of the different operator +precedence and the absence of an overloaded logical `and` operator. More on +this in the appendix about condition syntax in the `HDF5 manual`_. + +There are quite a few packages affected by those limitations including NumPy_ +themselves and SQLObject_, and there have been quite longish discussions about +adding the possibility of overloading logical operators to Python (see `PEP +335`_ and `this thread`__ for more details). + +__ https://mail.python.org/pipermail/python-dev/2004-September/048763.html + + +I can not select rows using in-kernel queries with a condition that involves an UInt64Col. Why? +----------------------------------------------------------------------------------------------- + +This turns out to be a limitation of the numexpr_ package. Internally, +numexpr_ uses a limited set of types for doing calculations, and unsigned +integers are always upcasted to the immediate signed integer that can fit the +information. The problem here is that there is not a (standard) signed integer +that can be used to keep the information of a 64-bit unsigned integer. + +So, your best bet right now is to avoid `uint64` types if you can. If you +absolutely need `uint64`, the only way for doing selections with this is +through regular Python selections. For example, if your table has a `colM` +column which is declared as an `UInt64Col`, then you can still filter its +values with:: + + [row['colN'] for row in table if row['colM'] < X] + + +However, this approach will generally lead to slow speed (specially on Win32 +platforms, where the values will be converted to Python `long` values). + + +I'm already using PyTables 2.x but I'm still getting numarray objects instead of NumPy ones! +-------------------------------------------------------------------------------------------- + +This is most probably due to the fact that you are using a file created with +PyTables 1.x series. By default, PyTables 1.x was setting an HDF5 attribute +`FLAVOR` with the value `'numarray'` to all leaves. Now, PyTables 2.x sees +this attribute and obediently converts the internal object (truly a NumPy +object) into a `numarray` one. For PyTables 2.x files the `FLAVOR` attribute +will only be saved when explicitly set via the `leaf.flavor` property (or when +passing data to an :class:`Array` or :class:`Table` at creation time), so you +will be able to distinguish default flavors from user-set ones by checking the +existence of the `FLAVOR` attribute. 
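+
+For instance, a minimal sketch of that check (assuming ``h5file`` is an
+already opened file; written with the 2.x-style calls used in the example
+below)::
+
+    for leaf in h5file.walkNodes(classname='Leaf'):
+        if 'FLAVOR' in leaf.attrs._v_attrnames:   # user-set flavor
+            print(leaf._v_pathname, '->', leaf.flavor)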
+ +Meanwhile, if you don't want to receive `numarray` objects when reading old +files, you have several possibilities: + +* Remove the flavor for your datasets by hand:: + + for leaf in h5file.walkNodes(classname='Leaf'): + del leaf.flavor + +* Use the :program:'ptrepack` utility with the flag `--upgrade-flavors` + so as to convert all flavors in old files to the default (effectively by + removing the `FLAVOR` attribute). +* Remove the `numarray` (and/or `Numeric`) package from your system. + Then PyTables 2.x will return you pure NumPy objects (it can't be + otherwise!). + + +Installation issues +=================== + +Windows +------- + +Error when importing tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You have installed the binary installer for Windows and, when importing the +*tables* package you are getting an error like:: + + The command in "0x6714a822" refers to memory in "0x012011a0". The + procedure "written" could not be executed. + Click to ok to terminate. + Click to abort to debug the program. + +This problem can be due to a series of reasons, but the most probable one is +that you have a version of a DLL library that is needed by PyTables and it is +not at the correct version. Please, double-check the versions of the required +libraries for PyTables and install newer versions, if needed. In most cases, +this solves the issue. + +In case you continue getting problems, there are situations where other +programs do install libraries in the PATH that are **optional** to PyTables +(for example BZIP2 or LZO), but that they will be used if they are found in +your system (i.e. anywhere in your :envvar:`PATH`). So, if you find any of +these libraries in your PATH, upgrade it to the latest version available (you +don't need to re-install PyTables). + + +----- + + +.. target-notes:: + +.. _HDF5: http://www.hdfgroup.org/HDF5 +.. _`Python language`: http://www.python.org +.. _NumPy: http://www.numpy.org +.. _`users mailing list`: https://groups.google.com/group/pytables-users +.. _`archives of the user's list`: https://sourceforge.net/p/pytables/mailman/pytables-users/ +.. _`Gmane archives`: http://www.mail-archive.com/pytables-users@lists.sourceforge.net/ +.. _`R&D 100 Award`: http://www.hdfgroup.org/HDF5/RD100-2002/ +.. _ViTables: http://vitables.org +.. _Cython: http://www.cython.org +.. _`Francesc Alted`: https://github.com/FrancescAlted +.. _netCDF3: http://www.unidata.ucar.edu/software/netcdf +.. _`Scientific Python`: http://dirac.cnrs-orleans.fr/ScientificPython.html +.. _netCDF4: http://www.unidata.ucar.edu/software/netcdf +.. _OPeNDAP: http://opendap.org +.. _`PyTables Manual`: http://www.pytables.org/usersguide/index.html +.. _h5py: http://www.h5py.org +.. _bzip2: http://www.bzip.org +.. _Blosc: https://www.blosc.org +.. _`zlib`: http://zlib.net +.. _numexpr: https://github.com/pydata/numexpr +.. _`FAQ of h5py`: http://docs.h5py.org/en/latest/faq.html#what-s-the-difference-between-h5py-and-pytables +.. _`issue tracker`: https://github.com/PyTables/PyTables/issues +.. _GitHub: https://github.com +.. _`HDF5 manual`: https://portal.hdfgroup.org/display/HDF5/Datatypes +.. _SQLObject: http://sqlobject.org +.. _`PEP 335`: http://www.python.org/dev/peps/pep-0335 + + +.. 
todo:: fix links that point to wiki pages + diff --git a/doc/source/MIGRATING_TO_2.x.rst b/doc/source/MIGRATING_TO_2.x.rst new file mode 100644 index 0000000..f4a81a3 --- /dev/null +++ b/doc/source/MIGRATING_TO_2.x.rst @@ -0,0 +1,268 @@ +================================== +Migrating from PyTables 1.x to 2.x +================================== + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.com +:Author: Ivan Vilata i Balaguer +:Contact: ivan@selidor.net + + +Next are described a series of issues that you must have in mind when +migrating from PyTables 1.x to PyTables 2.x series. + + +New type system +=============== + +In PyTables 2.x all the data types for leaves are described through a couple +of classes: + +- ``Atom``: Describes homogeneous types of the atomic components in ``*Array`` + objects (``Array``, ``CArray``, ``EArray`` and ``VLArray``). + +- ``Description``: Describes (possibly nested) heterogeneous types in + ``Table`` objects. + +So, in order to upgrade to the new type system, you must perform the next +replacements: + +- ``*Array.stype`` --> ``*Array.atom.type`` (PyTables type) +- ``*Array.type`` --> ``*Array.atom.dtype`` (NumPy type) +- ``*Array.itemsize`` --> ``*Array.atom.itemsize`` (the size of the item) + +Furthermore, the PyTables types (previously called "string types") have +changed to better adapt to NumPy conventions. The next changes have been +applied: + +- PyTables types are now written in lower case, so 'Type' becomes 'type'. For + example, 'Int64' becomes now 'int64'. + +- 'CharType' --> 'string' + +- 'Complex32', 'Complex64' --> 'complex64', 'complex128'. Note that the + numeric part of a 'complex' type refers now to the *size in bits* of the + type and not to the precision, as before. + +See Appendix I of the Users' Manual on supported data types for more +information on the new PyTables types. + + +Important changes in ``Atom`` specification +=========================================== + +- The ``dtype`` argument of ``EnumAtom`` and ``EnumCol`` constructors + has been replaced by the ``base`` argument, which can take a + full-blown atom, although it accepts bare PyTables types as well. + This is a *mandatory* argument now. + +- ``vlstring`` pseudo-atoms used in ``VLArray`` nodes do no longer imply UTF-8 + (nor any other) encoding, they only store and load *raw strings of bytes*. + All encoding and decoding is left to the user. Be warned that reading old + files may yield raw UTF-8 encoded strings, which may be converted back to + Unicode in this way:: + + unistr = vlarray[index].decode('utf-8') + + If you need to work with variable-length Unicode strings, you may want to + use the new ``vlunicode`` pseudo-atom, which fully supports Unicode strings + with no encoding hassles. + +- Finally, ``Atom`` and ``Col`` are now abstract classes, so you can't use + them to create atoms or column definitions of an arbitrary type. If you + know the particular type you need, use the proper subclass; otherwise, use + the ``Atom.from_*()`` or ``Col.from_*()`` factory methods. See the section + on declarative classes in the reference. + + You are also advised to avoid using the inheritance of atoms to check for + their kind or type; for that purpose, use their ``kind`` and ``type`` + attributes. + + +New query system +================ + +- In-kernel conditions, since they are based now in Numexpr, must be written + *as strings*. 
For example, a condition that in 1.x was stated as:: + + result = [row['col2'] for row in table.where(table.cols.col1 == 1)] + + now should read:: + + result = [row['col2'] for row in table.where('col1 == 1')] + + That means that complex selections are possible now:: + + result = [ row['col2'] for row in + table.where('(col1 == 1) & (col3**4 > 1)') ] + +- For the same reason, conditions for indexed columns must be written as + strings as well. + + +New indexing system +=================== + +The indexing system has been totally rewritten from scratch for PyTables 2.0 +Pro Edition. The new indexing system has been included into PyTables with +release 2.3. Due to this, your existing indexes created with PyTables 1.x +will be useless, and although you will be able to continue using the actual +data in files, you won't be able to take advantage of any improvement in +speed. + +You will be offered the possibility to automatically re-create the indexes +in PyTables 1.x format to the new 2.0 format by using the ``ptrepack`` +utility. + + +New meanings for atom shape and ``*Array`` shape argument +========================================================= + +With PyTables 1.x, the atom shape was used for different goals depending on +the context it was used. For example, in ``createEArray()``, the shape of the +atom was used to specify the *dataset shape* of the object on disk, while in +``CArray`` the same atom shape was used to specify the *chunk shape* of the +dataset on disk. Moreover, for ``VLArray`` objects, the very same atom shape +specified the *type shape* of the data type. As you see, all of these was +quite a mess. + +Starting with PyTables 2.x, an ``Atom`` only specifies properties of the data +type (à la ``VLArray`` in 1.x). This lets the door open for specifying +multidimensional data types (that can be part of another layer of +multidimensional datasets) in a consistent way along all the ``*Array`` +objects in PyTables. + +As a consequence of this, ``File.createCArray()`` and ``File.createVLArray()`` +methods have received new parameters in order to make possible to specify the +shapes of the datasets as well as chunk sizes (in fact, it is possible now to +specify the latter for all the chunked leaves, see below). Please have this +in mind during the migration process. + +Another consequence is that, now that the meaning of the atom shape is clearly +defined, it has been chosen as the main object to describe homogeneous data +types in PyTables. See the Users' Manual for more info on this. + + +New argument ``chunkshape`` of chunked leaves +============================================= + +It is possible now to specify the chunk shape for all the chunked leaves in +PyTables (all except ``Array``). With PyTables 1.x this value was +automatically calculated so as to achieve decent results in most of the +situations. However, the user may be interested in specifying its own chunk +shape based on her own needs (although this should be done only by advanced +users). + +Of course, if this parameter is not specified, a sensible default is +calculated for the size of the leave (which is recommended). + +A new attribute called ``chunkshape`` has been added to all leaves. It is +read-only (you can't change the size of chunks once you have created a leaf), +but it can be useful for inspection by advanced users. 
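+
+As a minimal sketch (the file name, shapes and chunk shape below are arbitrary
+choices, not recommendations), using the 2.x API names used throughout this
+document::
+
+    import tables
+
+    f = tables.openFile('chunked.h5', mode='w')
+    atom = tables.Float64Atom()
+    carray = f.createCArray(f.root, 'data', atom, shape=(1000, 1000),
+                            chunkshape=(100, 100))
+    print(carray.chunkshape)   # -> (100, 100); read-only once created
+    f.close()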
+ + +New flavor specification +======================== + +As of 2.x, flavors can *only* be set through the ``flavor`` attribute of +leaves, and they are *persistent*, so changing a flavor requires that the file +be writable. + +Flavors can no longer be set through ``File.create*()`` methods, nor the +``flavor`` argument previously found in some ``Table`` methods, nor through +``Atom`` constructors or the ``_v_flavor`` attribute of descriptions. + + +System attributes can be deleted now +==================================== + +The protection against removing system attributes (like ``FILTERS``, +``FLAVOR`` or ``CLASS``, to name only a few) has been completely removed. It +is now the responsibility of the user to make a proper use of this freedom. +With this, users can get rid of all proprietary PyTables attributes if they +want to (for example, for making a file to look more like an HDF5 native one). + + +Byteorder issues +================ + +Now, all the data coming from reads and internal buffers is always converted +on-the-fly, if needed, to the *native* byteorder. This represents a big +advantage in terms of speed when operating with objects coming from files that +have been created in machines with a byte ordering different from native. + +Besides, all leaf constructors have received a new ``byteorder`` parameter +that allows specifying the byteorder of data on disk. In particular, a +``_v_byteorder`` entry in a Table description is no longer honored and you +should use the aforementioned ``byteorder`` parameter. + + +Tunable internal buffer sizes +============================= + +You can change the size of the internal buffers for I/O purposes of PyTables +by changing the value of the new public attribute ``nrowsinbuf`` that is +present in all leaves. By default, this contains a sensible value so as to +achieve a good balance between speed and memory consumption. Be careful when +changing it, if you don't want to get unwanted results (very slow I/O, huge +memory consumption...). + + +Changes to module names +======================= + +If your application is directly accessing modules under the ``tables`` +package, you need to know that *the names of all modules are now all in +lowercase*. This allows one to tell apart the ``tables.Array`` *class* from +the ``tables.array`` *module* (which was also called ``tables.Array`` before). +This includes subpackages like ``tables.nodes.FileNode``. + +On top of that, more-or-less independent modules have also been renamed and +some of them grouped into subpackages. The most important are: + +- The ``tables.netcdf3`` subpackage replaces the old ``tables.NetCDF`` module. +- The ``tables.nra`` subpackage replaces the old ``nestedrecords.py`` with the + implementation of the ``NestedRecArray`` class. + +Also, the ``tables.misc`` package includes utility modules which do not depend +on PyTables. + + +Other changes +============= + +- ``Filters.complib`` is ``None`` for filter properties created with + ``complevel=0`` (i.e. disabled compression, which is the default). +- 'non-relevant' --> 'irrelevant' (applied to byteorders) +- ``Table.colstypes`` --> ``Table.coltypes`` +- ``Table.coltypes`` --> ``Table.coldtypes`` +- Added ``Table.coldescr``, dictionary of the ``Col`` descriptions. +- ``Table.colshapes`` has disappeared. You can get it this way:: + + colshapes = dict( (name, col.shape) + for (name, col) in table.coldescr.iteritems() ) + +- ``Table.colitemsizes`` has disappeared. 
You can get it this way::
+
+    colitemsizes = dict( (name, col.itemsize)
+                         for (name, col) in table.coldescr.iteritems() )
+
+- ``Description._v_totalsize`` --> ``Description._v_itemsize``
+- ``Description._v_itemsizes`` and ``Description._v_totalsizes`` have
+  disappeared.
+
+- ``Leaf._v_chunksize`` --> ``Leaf.chunkshape``
+
+
+----
+
+  **Enjoy data!**
+
+  -- The PyTables Team
+
+
+.. Local Variables:
+.. mode: rst
+.. coding: utf-8
+.. fill-column: 78
+.. End:
diff --git a/doc/source/MIGRATING_TO_3.x.rst b/doc/source/MIGRATING_TO_3.x.rst
new file mode 100644
index 0000000..02f5249
--- /dev/null
+++ b/doc/source/MIGRATING_TO_3.x.rst
@@ -0,0 +1,581 @@
+==================================
+Migrating from PyTables 2.x to 3.x
+==================================
+
+:Author: Antonio Valentino
+:Author: Anthony Scopatz
+:Author: Thomas Provoost
+
+This document describes the major changes in PyTables in going from the
+2.x to 3.x series and what you need to know when migrating downstream
+code bases.
+
+Python 3 at Last!
+=================
+
+The PyTables 3.x series now ships with full compatibility for Python 3.1+.
+Additionally, we plan on maintaining compatibility with Python 2.7 for the
+foreseeable future. Python 2.6 is no longer supported but
+may work in most cases. Note that the entire 3.x series now relies on
+numexpr v2.1+, which is the first version of numexpr to support both
+Python 2 & 3.
+
+Numeric, Numarray, NetCDF3, & HDF5 1.6 No More!
+===============================================
+
+PyTables no longer supports Numeric and numarray. Please use NumPy instead.
+Additionally, the ``tables.netcdf3`` module has been removed. Please refer
+to the `netcdf4-python`_ project for further support. Lastly, the older
+HDF5 1.6 API is no longer supported. Please upgrade to HDF5 1.8+.
+
+Unicode all the strings!
+========================
+
+In Python 3, all strings are natively in Unicode. This introduces some
+difficulties, as the native HDF5 string format is not Unicode-compatible.
+To minimize explicit conversion troubles when writing, especially :ref:`when
+creating data sets from existing Python objects `, string
+objects are implicitly cast to non-Unicode for HDF5 storage. To make you
+aware of this, a warning is raised when this happens.
+
+This is certainly not true Unicode compatibility; it is mainly a convenience
+for the pure-Unicode Python 3 string type. Any string that is not castable
+as ASCII upon creation of your data set will hence still raise an error.
+For true Unicode support, look into the ``VLUnicodeAtom`` class.
+
+Major API Changes
+=================
+
+The PyTables developers, `by popular demand`_, have taken this opportunity
+that a major version number upgrade affords to implement significant API
+changes. We have tried to do this in such a way that most existing code will
+not immediately break, though some breakages may still occur.
+
+PEP 8 Compliance
+****************
+The PyTables 3.x series now follows the `PEP 8`_ coding standard. This makes
+using PyTables more idiomatic with surrounding Python code that also adheres
+to this standard. The primary way that the 2.x series was *not* PEP 8
+compliant was with respect to variable naming conventions. Approximately
+:ref:`450 API variables ` were identified and updated for
+PyTables 3.x.
+
+To ease migration, PyTables ships with a new ``pt2to3`` command line tool.
+This tool will run over a file and replace any instances of the old variable
+names with the 3.x version of the name.
This tool covers the overwhelming
+majority of cases and was used to transition the PyTables code base itself!
+However, it may also accidentally pick up variable names in third-party code
+that have *exactly* the same name as a PyTables variable. This is because
+``pt2to3`` was implemented using regular expressions rather than a fancier
+AST-based method. By using regexes, ``pt2to3`` works on Python and Cython
+code.
+
+
+``pt2to3`` **help:**
+
+.. code-block:: bash
+
+    usage: pt2to3 [-h] [-r] [-p] [-o OUTPUT] [-i] filename
+
+    PyTables 2.x -> 3.x API transition tool This tool displays to standard out, so
+    it is common to pipe this to another file: $ pt2to3 oldfile.py > newfile.py
+
+    positional arguments:
+      filename              path to input file.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -r, --reverse         reverts changes, going from 3.x -> 2.x.
+      -p, --no-ignore-previous
+                            ignores previous_api() calls.
+      -o OUTPUT             output file to write to.
+      -i, --inplace         overwrites the file in-place.
+
+Note that ``pt2to3`` only works on a single file, not a directory. However,
+a simple Bash script may be written to run ``pt2to3`` over an entire directory
+and all sub-directories:
+
+.. code-block:: bash
+
+    #!/bin/bash
+    for f in $(find .)
+    do
+        echo $f
+        pt2to3 $f > temp.txt
+        mv temp.txt $f
+    done
+
+.. note::
+
+    :program:`pt2to3` uses the :mod:`argparse` module that is part of the
+    Python standard library since Python 2.7.
+    Users of Python 2.6 should install :mod:`argparse` separately
+    (e.g. via :program:`pip`).
+
+The old APIs and variable names will continue to be supported for the short
+term, where possible. (The major backwards-incompatible changes come from the
+renaming of some function and method arguments and keyword arguments.) Using
+the 2.x APIs in the 3.x series, however, will issue warnings. The following is
+the release plan for the warning types:
+
+* 3.0 - PendingDeprecationWarning
+* 3.1 - DeprecationWarning
+* >=3.2 - Remove warnings, previous_api(), and _past.py; keep pt2to3
+
+The current plan is to maintain the old APIs for at least 2 years, though this
+is subject to change.
+
+.. _create-signatures:
+
+Consistent ``create_xxx()`` Signatures
+***************************************
+
+Also by popular demand, it is now possible to create all data sets (``Array``,
+``CArray``, ``EArray``, ``VLArray``, and ``Table``) from existing Python
+objects. Constructors for these classes now accept either of the following
+keyword arguments:
+
+* an ``obj`` to initialize with data,
+* or both ``atom`` and ``shape`` to initialize an empty structure, if possible.
+
+These keyword arguments are also now part of the function signature for the
+corresponding ``create_xxx()`` methods on the ``File`` class. These would be
+called as follows::
+
+    # All create methods will support the following
+    create_xxx(where, name, obj=obj)
+
+    # All non-variable length arrays support the following:
+    create_xxx(where, name, atom=atom, shape=shape)
+
+Using ``obj`` is mutually exclusive with using ``atom`` and ``shape``.
+Previously only ``Array`` could be created from an existing Python object,
+using the ``object`` keyword argument.
+
+
+.. _api-name-changes:
+
+API Name Changes
+****************
+
+The following table shows the old 2.x names that have been updated to their
+new values in the new 3.x series. Please use the ``pt2to3`` tool to convert
+between these; a brief before/after sketch is shown next, followed by the
+full mapping table.
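+
+As a quick before/after illustration (only a sketch: the file names, node name
+and attribute value are arbitrary, and the 2.x spellings run only on versions
+that still provide the deprecated aliases)::
+
+    import tables
+
+    # PyTables 2.x spelling
+    h5f = tables.openFile('data-2x.h5', mode='w')
+    group = h5f.createGroup('/', 'detector')
+    h5f.setNodeAttr(group, 'gath_date', 'Wed, 06/12/2003 18:33')
+    h5f.close()
+
+    # PyTables 3.x spelling (what pt2to3 emits for the same operations)
+    h5f = tables.open_file('data-3x.h5', mode='w')
+    group = h5f.create_group('/', 'detector')
+    h5f.set_node_attr(group, 'gath_date', 'Wed, 06/12/2003 18:33')
+    h5f.close()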
+ +================================ ================================ +**2.x Name** **3.x Name** +================================ ================================ +AtomFromHDF5Type atom_from_hdf5_type +AtomToHDF5Type atom_to_hdf5_type +BoolTypeNextAfter bool_type_next_after +HDF5ClassToString hdf5_class_to_string +HDF5ToNPExtType hdf5_to_np_ext_type +HDF5ToNPNestedType hdf5_to_np_nested_type +IObuf iobuf +IObufcpy iobufcpy +IntTypeNextAfter int_type_next_after +NPExtPrefixesToPTKinds npext_prefixes_to_ptkinds +PTSpecialKinds pt_special_kinds +PTTypeToHDF5 pttype_to_hdf5 +StringNextAfter string_next_after +__allowedInitKwArgs __allowed_init_kwargs +__getRootGroup __get_root_group +__next__inKernel __next__inkernel +_actionLogName _action_log_name +_actionLogParent _action_log_parent +_actionLogPath _action_log_path +_addRowsToIndex _add_rows_to_index +_appendZeros _append_zeros +_autoIndex _autoindex +_byteShape _byte_shape +_c_classId _c_classid +_c_shadowNameRE _c_shadow_name_re +_cacheDescriptionData _cache_description_data +_checkAndSetPair _check_and_set_pair +_checkAttributes _check_attributes +_checkBase _checkbase +_checkColumn _check_column +_checkGroup _check_group +_checkNotClosed _check_not_closed +_checkOpen _check_open +_checkShape _check_shape +_checkShapeAppend _check_shape_append +_checkUndoEnabled _check_undo_enabled +_checkWritable _check_writable +_check_sortby_CSI _check_sortby_csi +_closeFile _close_file +_codeToOp _code_to_op +_column__createIndex _column__create_index +_compileCondition _compile_condition +_conditionCache _condition_cache +_convertTime64 _convert_time64 +_convertTime64_ _convert_time64_ +_convertTypes _convert_types +_createArray _create_array +_createCArray _create_carray +_createMark _create_mark +_createPath _create_path +_createTable _create_table +_createTransaction _create_transaction +_createTransactionGroup _create_transaction_group +_disableIndexingInQueries _disable_indexing_in_queries +_doReIndex _do_reindex +_emptyArrayCache _empty_array_cache +_enableIndexingInQueries _enable_indexing_in_queries +_enabledIndexingInQueries _enabled_indexing_in_queries +_exprvarsCache _exprvars_cache +_f_copyChildren _f_copy_children +_f_delAttr _f_delattr +_f_getAttr _f_getattr +_f_getChild _f_get_child +_f_isVisible _f_isvisible +_f_iterNodes _f_iter_nodes +_f_listNodes _f_list_nodes +_f_setAttr _f_setattr +_f_walkGroups _f_walk_groups +_f_walkNodes _f_walknodes +_fancySelection _fancy_selection +_fillCol _fill_col +_flushBufferedRows _flush_buffered_rows +_flushFile _flush_file +_flushModRows _flush_mod_rows +_g_addChildrenNames _g_add_children_names +_g_checkGroup _g_check_group +_g_checkHasChild _g_check_has_child +_g_checkName _g_check_name +_g_checkNotContains _g_check_not_contains +_g_checkOpen _g_check_open +_g_closeDescendents _g_close_descendents +_g_closeGroup _g_close_group +_g_copyAsChild _g_copy_as_child +_g_copyChildren _g_copy_children +_g_copyRows _g_copy_rows +_g_copyRows_optim _g_copy_rows_optim +_g_copyWithStats _g_copy_with_stats +_g_createHardLink _g_create_hard_link +_g_delAndLog _g_del_and_log +_g_delLocation _g_del_location +_g_flushGroup _g_flush_group +_g_getAttr _g_getattr +_g_getChildGroupClass _g_get_child_group_class +_g_getChildLeafClass _g_get_child_leaf_class +_g_getGChildAttr _g_get_gchild_attr +_g_getLChildAttr _g_get_lchild_attr +_g_getLinkClass _g_get_link_class +_g_listAttr _g_list_attr +_g_listGroup _g_list_group +_g_loadChild _g_load_child +_g_logAdd _g_log_add +_g_logCreate _g_log_create +_g_logMove _g_log_move 
+_g_maybeRemove _g_maybe_remove +_g_moveNode _g_move_node +_g_postInitHook _g_post_init_hook +_g_postReviveHook _g_post_revive_hook +_g_preKillHook _g_pre_kill_hook +_g_propIndexes _g_prop_indexes +_g_readCoords _g_read_coords +_g_readSelection _g_read_selection +_g_readSlice _g_read_slice +_g_readSortedSlice _g_read_sorted_slice +_g_refNode _g_refnode +_g_removeAndLog _g_remove_and_log +_g_setAttr _g_setattr +_g_setLocation _g_set_location +_g_setNestedNamesDescr _g_set_nested_names_descr +_g_setPathNames _g_set_path_names +_g_unrefNode _g_unrefnode +_g_updateDependent _g_update_dependent +_g_updateLocation _g_update_location +_g_updateNodeLocation _g_update_node_location +_g_updateTableLocation _g_update_table_location +_g_widthWarning _g_width_warning +_g_writeCoords _g_write_coords +_g_writeSelection _g_write_selection +_g_writeSlice _g_write_slice +_getColumnInstance _get_column_instance +_getConditionKey _get_condition_key +_getContainer _get_container +_getEnumMap _get_enum_map +_getFileId _get_file_id +_getFinalAction _get_final_action +_getInfo _get_info +_getLinkClass _get_link_class +_getMarkID _get_mark_id +_getNode _get_node +_getOrCreatePath _get_or_create_path +_getTypeColNames _get_type_col_names +_getUnsavedNrows _get_unsaved_nrows +_getValueFromContainer _get_value_from_container +_hiddenNameRE _hidden_name_re +_hiddenPathRE _hidden_path_re +_indexNameOf _index_name_of +_indexNameOf_ _index_name_of_ +_indexPathnameOf _index_pathname_of +_indexPathnameOfColumn _index_pathname_of_column +_indexPathnameOfColumn_ _index_pathname_of_column_ +_indexPathnameOf_ _index_pathname_of_ +_initLoop _init_loop +_initSortedSlice _init_sorted_slice +_isWritable _iswritable +_is_CSI _is_csi +_killNode _killnode +_lineChunkSize _line_chunksize +_lineSeparator _line_separator +_markColumnsAsDirty _mark_columns_as_dirty +_newBuffer _new_buffer +_notReadableError _not_readable_error +_npSizeType _npsizetype +_nxTypeFromNPType _nxtype_from_nptype +_opToCode _op_to_code +_openArray _open_array +_openUnImplemented _open_unimplemented +_pointSelection _point_selection +_processRange _process_range +_processRangeRead _process_range_read +_pythonIdRE _python_id_re +_reIndex _reindex +_readArray _read_array +_readCoordinates _read_coordinates +_readCoords _read_coords +_readIndexSlice _read_index_slice +_readSelection _read_selection +_readSlice _read_slice +_readSortedSlice _read_sorted_slice +_refNode _refnode +_requiredExprVars _required_expr_vars +_reservedIdRE _reserved_id_re +_reviveNode _revivenode +_saveBufferedRows _save_buffered_rows +_searchBin _search_bin +_searchBinNA_b _search_bin_na_b +_searchBinNA_d _search_bin_na_d +_searchBinNA_e _search_bin_na_e +_searchBinNA_f _search_bin_na_f +_searchBinNA_g _search_bin_na_g +_searchBinNA_i _search_bin_na_i +_searchBinNA_ll _search_bin_na_ll +_searchBinNA_s _search_bin_na_s +_searchBinNA_ub _search_bin_na_ub +_searchBinNA_ui _search_bin_na_ui +_searchBinNA_ull _search_bin_na_ull +_searchBinNA_us _search_bin_na_us +_setAttributes _set_attributes +_setColumnIndexing _set_column_indexing +_shadowName _shadow_name +_shadowParent _shadow_parent +_shadowPath _shadow_path +_sizeToShape _size_to_shape +_tableColumnPathnameOfIndex _table_column_pathname_of_index +_tableFile _table_file +_tablePath _table_path +_table__autoIndex _table__autoindex +_table__getautoIndex _table__getautoindex +_table__setautoIndex _table__setautoindex +_table__whereIndexed _table__where_indexed +_transGroupName _trans_group_name +_transGroupParent _trans_group_parent 
+_transGroupPath _trans_group_path +_transName _trans_name +_transParent _trans_parent +_transPath _trans_path +_transVersion _trans_version +_unrefNode _unrefnode +_updateNodeLocations _update_node_locations +_useIndex _use_index +_vShape _vshape +_vType _vtype +_v__nodeFile _v__nodefile +_v__nodePath _v__nodepath +_v_colObjects _v_colobjects +_v_maxGroupWidth _v_max_group_width +_v_maxTreeDepth _v_maxtreedepth +_v_nestedDescr _v_nested_descr +_v_nestedFormats _v_nested_formats +_v_nestedNames _v_nested_names +_v_objectID _v_objectid +_whereCondition _where_condition +_writeCoords _write_coords +_writeSelection _write_selection +_writeSlice _write_slice +appendLastRow append_last_row +attrFromShadow attr_from_shadow +attrToShadow attr_to_shadow +autoIndex autoindex +bufcoordsData bufcoords_data +calcChunksize calc_chunksize +checkFileAccess check_file_access +checkNameValidity check_name_validity +childName childname +chunkmapData chunkmap_data +classIdDict class_id_dict +className classname +classNameDict class_name_dict +containerRef containerref +convertToNPAtom convert_to_np_atom +convertToNPAtom2 convert_to_np_atom2 +copyChildren copy_children +copyClass copyclass +copyFile copy_file +copyLeaf copy_leaf +copyNode copy_node +copyNodeAttrs copy_node_attrs +countLoggedInstances count_logged_instances +createArray create_array +createCArray create_carray +createCSIndex create_csindex +createEArray create_earray +createExternalLink create_external_link +createGroup create_group +createHardLink create_hard_link +createIndex create_index +createIndexesDescr create_indexes_descr +createIndexesTable create_indexes_table +createNestedType create_nested_type +createSoftLink create_soft_link +createTable create_table +createVLArray create_vlarray +defaultAutoIndex default_auto_index +defaultIndexFilters default_index_filters +delAttr del_attr +delAttrs _del_attrs +delNodeAttr del_node_attr +detectNumberOfCores detect_number_of_cores +disableUndo disable_undo +dumpGroup dump_group +dumpLeaf dump_leaf +dumpLoggedInstances dump_logged_instances +enableUndo enable_undo +enumFromHDF5 enum_from_hdf5 +enumToHDF5 enum_to_hdf5 +fetchLoggedInstances fetch_logged_instances +flushRowsToIndex flush_rows_to_index +getAttr get_attr +getAttrs _get_attrs +getClassByName get_class_by_name +getColsInOrder get_cols_in_order +getCurrentMark get_current_mark +getEnum get_enum +getFilters get_filters +getHDF5Version get_hdf5_version +getIndices get_indices +getLRUbounds get_lru_bounds +getLRUsorted get_lru_sorted +getLookupRange get_lookup_range +getNestedField get_nested_field +getNestedFieldCache get_nested_field_cache +getNestedType get_nested_type +getNode get_node +getNodeAttr get_node_attr +getPyTablesVersion get_pytables_version +getTypeEnum get_type_enum +getWhereList get_where_list +hdf5Extension hdf5extension +hdf5Version hdf5_version +indexChunk indexchunk +indexValid indexvalid +indexValidData index_valid_data +indexValues indexvalues +indexValuesData index_values_data +indexesExtension indexesextension +infType inftype +infinityF infinityf +infinityMap infinitymap +initRead initread +isHDF5File is_hdf5_file +isPyTablesFile is_pytables_file +isUndoEnabled is_undo_enabled +isVisible isvisible +isVisibleName isvisiblename +isVisibleNode is_visible_node +isVisiblePath isvisiblepath +is_CSI is_csi +iterNodes iter_nodes +iterseqMaxElements iterseq_max_elements +joinPath join_path +joinPaths join_paths +linkExtension linkextension +listLoggedInstances list_logged_instances +listNodes list_nodes +loadEnum 
load_enum +logInstanceCreation log_instance_creation +lrucacheExtension lrucacheextension +metaIsDescription MetaIsDescription +modifyColumn modify_column +modifyColumns modify_columns +modifyCoordinates modify_coordinates +modifyRows modify_rows +moveFromShadow move_from_shadow +moveNode move_node +moveToShadow move_to_shadow +newNode new_node +newSet newset +newdstGroup newdst_group +objectID object_id +oldPathname oldpathname +openFile open_file +openNode open_node +parentNode parentnode +parentPath parentpath +reIndex reindex +reIndexDirty reindex_dirty +readCoordinates read_coordinates +readIndices read_indices +readSlice read_slice +readSorted read_sorted +readWhere read_where +read_sliceLR read_slice_lr +recreateIndexes recreate_indexes +redoAddAttr redo_add_attr +redoCreate redo_create +redoDelAttr redo_del_attr +redoMove redo_move +redoRemove redo_remove +removeIndex remove_index +removeNode remove_node +removeRows remove_rows +renameNode rename_node +rootUEP root_uep +searchLastRow search_last_row +setAttr set_attr +setAttrs _set_attrs +setBloscMaxThreads set_blosc_max_threads +setInputsRange set_inputs_range +setNodeAttr set_node_attr +setOutput set_output +setOutputRange set_output_range +silenceHDF5Messages silence_hdf5_messages +splitPath split_path +tableExtension tableextension +undoAddAttr undo_add_attr +undoCreate undo_create +undoDelAttr undo_del_attr +undoMove undo_move +undoRemove undo_remove +utilsExtension utilsextension +walkGroups walk_groups +walkNodes walk_nodes +whereAppend append_where +whereCond wherecond +whichClass which_class +whichLibVersion which_lib_version +willQueryUseIndexing will_query_use_indexing +================================ ================================ + +---- + + **Enjoy data!** + + -- The PyTables Developers + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 78 +.. End: + + +.. _by popular demand: http://sourceforge.net/mailarchive/message.php?msg_id=29584752 + +.. _PEP 8: http://www.python.org/dev/peps/pep-0008/ + +.. _netcdf4-python: http://code.google.com/p/netcdf4-python/ diff --git a/doc/source/_static/logo-pytables-small.png b/doc/source/_static/logo-pytables-small.png new file mode 100644 index 0000000..b614a7e Binary files /dev/null and b/doc/source/_static/logo-pytables-small.png differ diff --git a/doc/source/_templates/layout.html b/doc/source/_templates/layout.html new file mode 100644 index 0000000..f47665b --- /dev/null +++ b/doc/source/_templates/layout.html @@ -0,0 +1,15 @@ +{% extends "!layout.html" %} + +{%- block extrahead %} +{{ super() }} + + + + +{% endblock %} diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..ba95716 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,208 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +from pathlib import Path + + +# -- Project information ----------------------------------------------------- + +project = 'PyTables' +copyright = '2011–2021, PyTables maintainers' +author = 'PyTables maintainers' + +# The short X.Y version +import tables as tb +# from packaging.version import Version +# version = Version(tb.__version__).base_version +version = tb.__version__ + +# The full version, including alpha/beta/rc tags +release = tb.__version__ + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +needs_sphinx = '1.3' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.mathjax', + 'sphinx.ext.inheritance_diagram', + 'sphinx.ext.extlinks', + 'sphinx.ext.todo', + 'sphinx.ext.viewcode', + 'IPython.sphinxext.ipython_console_highlighting', + #'numpydoc', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + # 'sticky_navigation': True # Set to False to disable the sticky nav while scrolling. + 'logo_only': True, # if we have a html_logo below, this shows /only/ the logo with no title text +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +# Add any paths that contain custom themes here, relative to this directory. 
+import sphinx_rtd_theme +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +html_logo = '_static/logo-pytables-small.png' + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PyTablesDoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + 'preamble': r'''\usepackage{bookmark,hyperref} +\usepackage[para]{threeparttable} +\DeclareUnicodeCharacter{210F}{$\hbar$}''', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('usersguide/usersguide', 'usersguide-%s.tex' % version, + 'PyTables User Guide', 'PyTables maintainers', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +latex_logo = 'usersguide/images/pytables-front-logo.pdf' + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +latex_use_parts = True + +# If false, no module index is generated. +latex_domain_indices = False + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project +epub_author = author +epub_publisher = author +epub_copyright = copyright + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} + +# -- External link options ---------------------------------------------------- +extlinks = { + 'issue': ('https://github.com/PyTables/PyTables/issues/%s', 'gh-'), +} + +# -- Options for autodocumentation --------------------------------------------- +autodoc_member_order = "groupwise" +autoclass_content = "class" +autosummary_generate = [] diff --git a/doc/source/cookbook/custom_data_types.rst b/doc/source/cookbook/custom_data_types.rst new file mode 100644 index 0000000..9caeb0d --- /dev/null +++ b/doc/source/cookbook/custom_data_types.rst @@ -0,0 +1,99 @@ +:author: KennethArnold +:date: 2009-07-14 21:51:07 + + +================================ +Using your own custom data types +================================ + +You can make your own data types by subclassing Table (or other PyTables types, +such as :class:`tables.Leaf`). +This can be useful for storing a specialized type of data or presenting a +customized API. + +Submitted by Kevin R. Thornton. 
+ +:: + + import numpy as np + + import tables + from tables import File, Table + from tables.file import _checkfilters + + from tables.parameters import EXPECTED_ROWS_TABLE + + + class DerivedFromTable(Table): + _c_classId = 'DerivedFromTable' + + def __init__(self, parentNode, name, description=None, + title="", filters=None, + expectedrows=EXPECTED_ROWS_TABLE, + chunkshape=None, byteorder=None, _log=True): + super().__init__(parentNode, name, + description=description, title=title, + filters=filters, + expectedrows=expectedrows, + chunkshape=chunkshape, byteorder=byteorder, + _log=_log) + + def read(self, start=None, stop=None, step=None, field=None): + print("HERE!") + data = Table.read(self, start=start, stop=stop, step=step, + field=field) + return data + + + def createDerivedFromTable(self, where, name, data, title="", + filters=None, expectedrows=10000, + chunkshape=None, byteorder=None, + createparents=False): + parentNode = self._get_or_create_path(where, createparents) + + _checkfilters(filters) + return DerivedFromTable(parentNode, name, data, + title=title, filters=filters, + expectedrows=expectedrows, + chunkshape=chunkshape, byteorder=byteorder) + + + File.createDerivedFromTable = createDerivedFromTable + + + if __name__ == '__main__': + x = np.random.rand(100).reshape(50,2) + x.dtype = [('x',float), ('y',float)] + h5file = tables.open_file('tester.hdf5', 'w') + mtab = h5file.createDerivedFromTable(h5file.root, 'random', x) + + h5file.flush() + print(type(mtab)) + mtab_read = mtab.read() + h5file.close() + h5file = tables.open_file('tester.hdf5', 'r') + mtab = h5file.root.random + + print(type(mtab)) + mtab_read2 = mtab.read() + print(np.array_equal(mtab_read, mtab_read2)) + + +There is an issue that the DerivedFromTable read function will not be called +when the file is re-opened. The notion that the H5 file contains a derived +object gets lost. The output shows that the read function is only called before +the function is closed: + +:: + + HERE! + + True + Closing remaining open files:tester.hdf5...done + + +I ran into this because I wanted a custom read that returned a more complex +object implemented in C++. Using pybind11, I'm easily able to write to a +Table via a record array. I was hoping that I could read back in, construct the +correct C++-based type, and return it. The example seems to suggest that this +is not possible. diff --git a/doc/source/cookbook/hints_for_sql_users.rst b/doc/source/cookbook/hints_for_sql_users.rst new file mode 100644 index 0000000..866357e --- /dev/null +++ b/doc/source/cookbook/hints_for_sql_users.rst @@ -0,0 +1,723 @@ +:author: valhallasw +:date: 2012-06-18 10:15:15 + +=================== +Hints for SQL users +=================== + +This page is intended to be **a guide to new PyTables for users who are used +to writing SQL code** to access their relational databases. +It will cover the most usual SQL statements. +If you are missing a particular statement or usage example, you can ask at the +`PyTables users' list`_ for it. +If you know some examples yourself, you can also write them here! + +This page is under development: you can come back frequently to check for new +examples. +Also, this is no replacement for the `User's Guide`_; +if you don't read the manual, you'll be missing lots of features not available +in relational databases! + +Examples in Python assume that you have imported the PyTables package like +this:: + + import tables + +.. .. 
contents:: Table Of Contents + + +Creating a new database +======================= + +RDBMs happen to have several syntaxes for creating a database. +A usual syntax is:: + + CREATE DATABASE database_name + +In PyTables, each database goes to a different HDF5_ file (much like +SQLite_ or MS Access). +To create a new HDF5_ file, you use the :func:`tables.open_file` function with +the ``'w'`` mode (which deletes the database if it already exists), like this:: + + h5f = tables.open_file('database_name.h5', 'w') + +In this way you get the ``h5f`` PyTables file handle (an instance of the +:class:`tables.File` class), which is a concept similar to a *database +connection*, and a new :file:`database_name.h5` file is created in the current +directory (you can use full paths here). +You can close the handle (like you close the connection) with:: + + h5f.close() + +This is important for PyTables to dump pending changes to the database. +In case you forget to do it, PyTables closes all open database handles for +you when you exit your program or interactive session, but it is always safer +to close your files explicitly. +If you want to use the database after closing it, you just call +:func:`open_file` again, but using the ``'r+'`` or ``'r'`` modes, depending on +whether you do or don't need to modify the database, respectively. + +You may use several PyTables databases simultaneously in a program, so you +must be explicit on which database you want to act upon (by using its handle). + +A note on concurrency under PyTables +------------------------------------ + +Unlike most RDBMs, PyTables is not intended to serve concurrent accesses to a +database. +It has no protections whatsoever against corruption for different (or even the +same) programs accessing the same database. +Opening several handles to the same database in read-only mode is safe, though. + + +Creating a table +================ + +PyTables supports some other *datasets* besides tables, and they're not +arranged in a flat namespace, but rather into a *hierarchical* one (see an +introduction to the _ref:`object tree `); +however, due to the nature of these recipes, we'll limit ourselves to tables +in the *root group*. +The basic syntax for table creation under SQL is:: + + CREATE TABLE table_name ( + column_name1 column_type1, + column_name2 column_type2, + ... + column_nameN column_typeN + ) + + +Table descriptions +------------------ + +In PyTables, one first *describes* the structure of a table. +PyTables allows you to *reuse a description* for creating several tables with +the same structure, just by using the description object (``description_name`` +below) or getting it from a created table. +This is specially useful for creating temporary tables holding query results. + +You can create a table description using a dictionary:: + + description_name = { + 'column_name1': column_type1, + 'column_name2': column_type2, + 'column_name3': column_type3, + ... + 'column_nameN': column_typeN + } + +or a subclass of :class:`tables.IsDescription`:: + + class description_name(tables.IsDescription): + column_name1 = column_type1 + column_name2 = column_type2 + column_name3 = column_type3 + ... + column_nameN = column_typeN + +Please note that dictionaries are the only way of describing structures with +names which cannot be Python identifiers. +Also, if an explicit order is desired for columns, it must be specified through +the column type declarations (see below), since dictionary keys and class +attributes aren't ordered. 
+Otherwise, columns are ordered in alphabetic increasing order. +It is important to note that PyTables doesn't have a concept of primary or +foreign keys, so relationships between tables are left to the user. + + +Column type declarations +------------------------ + +PyTables supports lots of types (including nested and multidimensional +columns). +Non-nested columns are declared through instances of :class:`tables.Col` +subclasses (which you can also reuse). +These are some correspondences with SQL: + +==================== ========================== +SQL type declaration PyTables type declaration +==================== ========================== +INTEGER(digits) tables.IntCol(itemsize) +REAL tables.FloatCol() +VARCHAR(length) tables.StringCol(itemsize) +DATE tables.Time32Col() +TIMESTAMP tables.Time64Col() +==================== ========================== + +See a complete description of :ref:`PyTables types `. +Note that some types admit different *item sizes*, which are specified in +bytes. +For types with a limited set of supported item sizes, you may also use specific +subclasses which are named after the type and its *precision*, e.g. ``Int32Col`` +for 4-byte (32 bit) item size. + +Cells in a PyTables' table always have a value of the cell type, so there is +no ``NULL``. +Instead, cells take a *default value* (zero or empty) which can be changed in +the type declaration, like this: ``col_name = StringCol(10, dflt='nothing')`` +(``col_name`` takes the value ``'nothing'`` if unset). +The declaration also allows you to set *column order* via the ``pos`` argument, +like this:: + + class ParticleDescription(tables.IsDescription): + name = tables.StringCol(10, pos=1) + x = tables.FloatCol(pos=2) + y = tables.FloatCol(pos=3) + temperature = tables.FloatCol(pos=4) + + +Using a description +=================== + +Once you have a table description ``description_name`` and a writeable file +handle ``h5f``, creating a table with that description is as easy as:: + + tbl = h5f.create_table('/', 'table_name', description_name) + +PyTables is very object-oriented, and database is usually done through +methods of :class:`tables.File`. +The first argument indicates the *path* where the table will be created, +i.e. the root path (HDF5 uses Unix-like paths). +The :meth:`tables.File.create_table` method has many options e.g. for setting +a table title or compression properties. What you get back is an instance of +:class:`tables.Table`, a handle for accessing the data in that table. + +As with files, table handles can also be closed with ``tbl.close()``. +If you want to access an already created table, you can use:: + + tbl = h5f.get_node('/', 'table_name') + +(PyTables uses the concept of *node* for datasets -tables and others- and +groups in the object tree) or, using *natural naming*:: + + tbl = h5f.root.table_name + +Once you have created a table, you can access (and reuse) its description by +accessing the ``description`` attribute of its handle. + + +Creating an index +================= + +RDBMs use to allow named indexes on any set of columns (or all of them) in a +table, using a syntax like:: + + CREATE INDEX index_name + ON table_name (column_name1, column_name2, column_name3...) + +and + + DROP INDEX index_name + +Indexing is supported in the versions of PyTables >= 2.3 (and in PyTablesPro). +However, indexes don't have names and they are bound to single columns. 
+Following the object-oriented philosophy of PyTables, index creation is a +method (:meth:`tables.Column.create_index`) of a :class:`tables.Column` object +of a table, which you can access trough its ``cols`` accessor. + +:: + tbl.cols.column_name.create_index() + +For dropping an index on a column:: + + tbl.cols.column_name.remove_index() + + +Altering a table +================ + +The first case of table alteration is renaming:: + + ALTER TABLE old_name RENAME TO new_name + +This is accomplished in !PyTables with:: + + h5f.rename_node('/', name='old_name', newname='new_name') + +or through the table handle:: + + tbl.rename('new_name') + +A handle to a table is still usable after renaming. +The second alteration, namely column addition, is currently not supported in +PyTables. + + +Dropping a table +================ + +In SQL you can remove a table using:: + + DROP TABLE table_name + +In PyTables, tables are removed as other nodes, using the +:meth:`tables.File.remove_node` method:: + + h5f.remove_node('/', 'table_name') + +or through the table handle:: + + tbl.remove() + +When you remove a table, its associated indexes are automatically removed. + + +Inserting data +============== + +In SQL you can insert data one row at a time (fetching from a selection will +be covered later) using a syntax like:: + + INSERT INTO table_name (column_name1, column_name2...) + VALUES (value1, value2...) + +In PyTables, rows in a table form a *sequence*, so data isn't *inserted* into +a set, but rather *appended* to the end of the sequence. +This also implies that identical rows may exist in a table (but they have a +different *row number*). +There are two ways of appending rows: one at a time or in a block. +The first one is conceptually similar to the SQL case:: + + tbl.row['column_name1'] = value1 + tbl.row['column_name2'] = value2 + ... + tbl.row.append() + +The ``tbl.row`` accessor represents a *new row* in the table. +You just set the values you want to set (the others take the default value +from their column declarations - see above) and the effectively append the +new row. +This code is usually enclosed in some kind of loop, like:: + + row = tbl.row + while some_condition: + row['column_name1'] = value1 + ... + row.append() + +For appending a block of rows in a single shot, :meth:`tables.Table.append` +is more adequate. +You just pass a NumPy_ record array or Python sequence with elements which +match the expected columns. +For example, given the ``tbl`` handle for a table with the ``ParticleDescription`` +structure described above:: + + rows = [ + ('foo', 0.0, 0.0, 150.0), + ('bar', 0.5, 0.0, 100.0), + ('foo', 1.0, 1.0, 25.0) + ] + tbl.append(rows) + + # Using a NumPy container. + import numpy + rows = numpy.rec.array(rows) + tbl.append(rows) + + +A note on transactions +---------------------- + +PyTables doesn't support transactions nor checkpointing or rolling back (there +is undo support for operations performed on the object tree, but this is +unrelated). +Changes to the database are optimised for maximum performance and reasonable +memory requirements, which means that you can't tell whether e.g. +``tbl.append()`` has actually committed all, some or no data to disk when it ends. 
+ +However, you can *force* PyTables to commit changes to disk using the ``flush()`` +method of table and file handles:: + + tbl.flush() # flush data in the table + h5f.flush() # flush all pending data + +Closing a table or a database actually flushes it, but it is recommended that +you explicitly flush frequently (specially with tables). + + +Updating data +============= + +We're now looking for alternatives to the SQL ``UPDATE`` statement:: + + UPDATE table_name + SET column_name1 = expression1, column_name2 = expression2... + [WHERE condition] + +There are different ways of approaching this, depending on your needs. +If you aren't using a condition, then the ``SET`` clause updates all rows, +something you can do in PyTables by iterating over the table:: + + for row in tbl: + row['column_name1'] = expression1 + row['column_name2'] = expression2 + ... + row.update() + +Don't forget to call ``update()`` or no value will be changed! +Also, since the used iterator allows you to read values from the current row, +you can implement a simple *conditional update*, like this:: + + for row in tbl: + if condition on row['column_name1'], row['column_name2']...: + row['column_name1'] = expression1 + row['column_name2'] = expression2 + ... + row.update() + +There are substantially more efficient ways of locating rows fulfilling a +condition. +Given the main PyTables usage scenarios, querying and modifying data are +quite decoupled operations, so we will have a look at querying later and +assume that you already know the set of rows you want to update. + +If the set happens to be a slice of the table, you may use the +:`meth:`tables.Table.modify_rows` method or its equivalent +:meth:`tables.Table.__setitem__` notation:: + + rows = [ + ('foo', 0.0, 0.0, 150.0), + ('bar', 0.5, 0.0, 100.0), + ('foo', 1.0, 1.0, 25.0) + ] + tbl.modifyRows(start=6, stop=13, step=3, rows=rows) + tbl[6:13:3] = rows # this is the same + +If you just want to update some columns in the slice, use the +:meth:`tables.Table.modify_columns` or :meth:`tables.Table.modify_column` +methods:: + + cols = [ + [150.0, 100.0, 25.0] + ] + # These are all equivalent. + tbl.modify_columns(start=6, stop=13, step=3, columns=cols, names=['temperature']) + tbl.modify_column(start=6, stop=13, step=3, column=cols[0], colname='temperature') + tbl.cols.temperature[6:13:3] = cols[0] + +The last line shows an example of using the ``cols`` accessor to get to the +desired :class:`tables.Column` of the table using natural naming and apply +``setitem`` on it. + +If the set happens to be an array of sparse coordinates, you can also use +PyTables' extended slice notation:: + + rows = [ + ('foo', 0.0, 0.0, 150.0), + ('bar', 0.5, 0.0, 100.0), + ('foo', 1.0, 1.0, 25.0) + ] + rownos = [2, 735, 371913476] + tbl[rownos] = rows + + +instead of the traditional:: + + for row_id, datum in zip(rownos, rows): + tbl[row_id] = datum + +Since you are modifying table data in all cases, you should also remember to +``flush()`` the table when you're done. + + +Deleting data +============= + +Rows are deleted from a table with the following SQL syntax:: + + DELETE FROM table_name + [WHERE condition] + +:meth:`tables.Table.remove_rows` is the method used for deleting rows in +PyTables. +However, it is very simple (only contiguous blocks of rows can be deleted) and +quite inefficient, and one should consider whether *dumping filtered data from +one table into another* isn't a much more convenient approach. 
+This is a far more optimized operation under PyTables which will be covered +later. + +Anyway, using ``remove_row()`` or ``remove_rows()`` is quite straightforward:: + + tbl.remove_row(12) # delete one single row (12) + tbl.remove_rows(12, 20) # delete all rows from 12 to 19 (included) + tbl.remove_rows(0, tbl.nrows) # delete all rows unconditionally + tbl.remove_rows(-4, tbl.nrows) # delete the last 4 rows + + +Reading data +============ + +The most basic syntax in SQL for reading rows in a table without using a +condition is:: + + SELECT (column_name1, column_name2... | *) FROM table_name + +Which reads all rows (though maybe not all columns) from a table. +In PyTables there are two ways of retrieving data: *iteratively* or *at once*. +You'll notice some similarities with how we appended and updated data above, +since this dichotomy is widespread here. + +For a clearer separation with conditional queries (covered further below), +and since the concept of *row number* doesn't exist in relational databases, +we'll be including here the cases where you want to read a **known** *slice* +or *sequence* of rows, besides the case of reading *all* rows. + + +Iterating over rows +------------------- + +This is similar to using the ``fetchone()`` method of a DB ``cursor`` in a +`Python DBAPI`_-compliant package, i.e. you *iterate* over the list of wanted +rows, getting one *row handle* at a time. +In this case, the handle is an instance of the :class:`tables.Row` class, +which allows access to individual columns as items accessed by key (so there +is no special way of selecting columns: you just use the ones you want +whenever you want). + +This way of reading rows is recommended when you want to perform operations +on individual rows in a simple manner, and specially if you want to process +a lot of rows in the table (i.e. when loading them all at once would take too +much memory). +Iterators are also handy for using with the ``itertools`` Python module for +grouping, sorting and other operations. + +For iterating over *all* rows, use plain iteration or the +:meth:`tables.Table.iterrows` method:: + + for row in tbl: # or tbl.iterrows() + do something with row['column_name1'], row['column_name2']... + +For iterating over a *slice* of rows, use the +:meth:`tables.Table.iterrows|Table.iterrows` method:: + + for row in tbl.iterrows(start=6, stop=13, step=3): + do something with row['column_name1'], row['column_name2']... + +For iterating over a *sequence* of rows, use the +:meth:`tables.Table.itersequence` method:: + + for row in tbl.itersequence([6, 7, 9, 11]): + do something with row['column_name1'], row['column_name2']... + +Reading rows at once +-------------------- + +In contrast with iteration, you can fetch all desired rows into a single +*container* in memory (usually an efficient NumPy_ record-array) in a single +operation, like the ``fetchall()`` or ``fetchmany()`` methods of a DBAPI ``cursor``. +This is specially useful when you want to transfer the read data to another +component in your program, avoiding loops to construct your own containers. +However, you should be careful about the amount of data you are fetching into +memory, since it can be quite large (and even exceed its physical capacity). + +You can choose between the ``Table.read*()`` methods or the +:meth:`tables.Table.__getitem__` syntax for this kind of reads. 
+The ``read*()`` methods offer you the chance to choose a single column to read +via their ``field`` argument (which isn't still as powerful as the SQL ``SELECT`` +column spec). + +For reading *all* rows, use ``[:]`` or the :meth:`tables.Table.read` method:: + + rows = tbl.read() + rows = tbl[:] # equivalent + +For reading a *slice* of rows, use ``[slice]`` or the +:meth:`tables.Table.read|Table.read` method:: + + rows = tbl.read(start=6, stop=13, step=3) + rows = tbl[6:13:3] # equivalent + +For reading a *sequence* of rows, use the :meth:`tables.Table.read_coordinates` +method:: + + rows = tbl.read_coordinates([6, 7, 9, 11]) + +Please note that you can add a ``field='column_name'`` argument to ``read*()`` +methods in order to get only the given column instead of them all. + + +Selecting data +============== + +When you want to read a subset of rows which match a given condition from a +table you use a syntax like this in SQL:: + + SELECT column_specification FROM table_name + WHERE condition + +The ``condition`` is an expression yielding a boolean value based on a +combination of column names and constants with functions and operators. +If the condition holds true for a given row, the ``column_specification`` is +applied on it and the resulting row is added to the result. + +In PyTables, you may filter rows using two approaches: the first one is +achieved through standard Python comparisons (similar to what we used for +conditional update), like this:: + + for row in tbl: + if condition on row['column_name1'], row['column_name2']...: + do something with row + +This is easy for newcomers, but not very efficient. That's why PyTables offers +another approach: **in-kernel** searches, which are much more efficient than +standard searches, and can take advantage of indexing (under PyTables >= 2.3). + +In-kernel searches are used through the *where methods* in ``Table``, which are +passed a *condition string* describing the condition in a Python-like syntax. +For instance, with the ``ParticleDescription`` we defined above, we may specify +a condition for selecting particles at most 1 unit apart from the origin with +a temperature under 100 with a condition string like this one:: + + '(sqrt(x**2 + y**2) <= 1) & (temperature < 100)' + +Where ``x``, ``y`` and ``temperature`` are the names of columns in the table. +The operators and functions you may use in a condition string are described +in the :ref:`appendix on condition syntax ` in the +`User's Guide`_. + + +Iterating over selected rows +---------------------------- + +You can iterate over the rows in a table which fulfill a condition (a la DBAPI +``fetchone()``) by using the :meth:`tables.Table.where` method, which is very +similar to the :meth:`tables.Table.iterrows` one discussed above, and which +can be used in the same circumstances (i.e. performing operations on individual +rows or having results exceeding available memory). + +Here is an example of using ``where()`` with the previous example condition:: + + for row in tbl.where('(sqrt(x**2 + y**2) <= 1) & (temperature < 100)'): + do something with row['name'], row['x']... + + +Reading selected rows at once +----------------------------- + +Like the aforementioned :meth:`tables.Table.read`, +:meth:`tables.Table.read_where` gets all the rows fulfilling the given +condition and packs them in a single container (a la DBAPI ``fetchmany()``). +The same warning applies: be careful on how many rows you expect to retrieve, +or you may run out of memory! 
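+
+If you cannot easily predict how many rows a condition will match, one
+possible guard (a sketch, not part of the original recipe; the threshold
+below is arbitrary) is to fetch the matching row numbers first with
+:meth:`tables.Table.get_where_list` (covered further below) and only load
+everything at once when the count looks reasonable::
+
+    condition = '(sqrt(x**2 + y**2) <= 1) & (temperature < 100)'
+    coords = tbl.get_where_list(condition)
+    if len(coords) < 1000000:      # arbitrary "fits in memory" threshold
+        rows = tbl.read_coordinates(coords)
+    else:                          # too many matches: iterate instead
+        for row in tbl.where(condition):
+            pass  # process one row at a time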
+ +Here is an example of using ``read_where()`` with the previous example +condition:: + + rows = tbl.read_where('(sqrt(x**2 + y**2) <= 1) & (temperature < 100)') + +Please note that both :meth:`tables.Table.where` and +:meth:`tables.Table.read_where` can also take slicing arguments. + + +Getting the coordinates of selected rows +---------------------------------------- + +There is yet another method for querying tables: +:meth:`tables.Table.get_where_list`. +It returns just a sequence of the numbers of the rows which fulfil the given +condition. +You may pass that sequence to :meth:`tables.Table.read_coordinates`, e.g. to +retrieve data from a different table where rows with the same number as the +queried one refer to the same first-class object or entity. + + +A note on table joins +--------------------- + +You may have noticed that queries in PyTables only cover one table. +In fact, there is no way of directly performing a join between two tables in +PyTables (remember that it's not a relational database). +You may however work around this limitation depending on your case: + +* If one table is an *extension* of another (i.e. it contains additional + columns for the same entities), your best bet is to arrange rows of the + same entity so that they are placed in the same positions in both tables. + For instance, if ``tbl1`` and ``tbl2`` follow this rule, you may do something + like this to emulate a natural join:: + + for row1 in tbl1.where('condition'): + row2 = tbl2[row1.nrow] + if condition on row2['column_name1'], row2['column_name2']...: + do something with row1 and row2... + + (Note that ``row1`` is a ``Row`` instance and ``row2`` is a record of the current + flavor.) + +* If rows in both tables are linked by a common value (e.g. acting as an + identifier), you'll need to split your condition in one for the first table + and one for the second table, and then nest your queries, placing the most + restrictive one first. For instance:: + + SELECT clients.name, bills.item_id FROM clients, bills + WHERE clients.id = bills.client_id and clients.age > 50 and bills.price > 200 + + could be written as:: + + for client in clients.where('age > 50'): + # Note that the following query is different for each client. + for bill in bills.where('(client_id == %r) & (price > 200)' % client['id']): + do something with client['name'] and bill['item_id'] + + In this example, indexing the ``client_id`` column of ``bills`` could speed up + the inner query quite a lot. 
+ Also, you could avoid parsing the inner condition each time by using + *condition variables*:: + + for client in clients.where('age > 50'): + for bill in bills.where('(client_id == cid) & (price > 200)', {'cid': client['id']}): + do something with client['name'] and bill['item_id'] + + +Summary of row selection methods +================================ + ++----------------------+-----------------+---------------------+-----------------------+-------------------------+ +| | **All rows** | **Range of rows** | **Sequence of rows** | **Condition** | ++----------------------+-----------------+---------------------+-----------------------+-------------------------+ +| **Iterative access** | ``__iter__()``, | ``iterrows(range)`` | ``itersequence()`` | ``where(condition)`` | +| | ``iterrows()`` | | | | ++----------------------+-----------------+---------------------+-----------------------+-------------------------+ +| **Block access** | ``[:]``, | ``[range]``, | ``readCoordinates()`` |``read_where(condition)``| +| | ``read()`` | ``read(range)`` | | | ++----------------------+-----------------+---------------------+-----------------------+-------------------------+ + + +Sorting the results of a selection +================================== + +*Do you feel like writing this section? Your contribution is welcome!* + + +Grouping the results of a selection +=================================== + +By making use of the :func:`itertools.groupby` utility, you can group results +by field:: + + group = {} # dictionary to put results grouped by 'pressure' + def pressure_selector(row): + return row['pressure'] + for pressure, rows_grouped_by_pressure in itertools.groupby(mytable, pressure_selector): + group[pressure] = sum((r['energy'] + r['ADCcount'] for r in rows_grouped_by_pressure)) + +However, :func:`itertools.groupby` assumes the incoming array is sorted by the +grouping field. +If not, there are multiple groups with the same grouper returned. +In the example, mytable thus has to be sorted on pressure, or the last line +should be changed to:: + + group[pressure] += sum((r['energy'] + r['ADCcount'] for r in rows_grouped_by_pressure)) + + +----- + + +.. target-notes:: + +.. _`PyTables users' list`: https://lists.sourceforge.net/lists/listinfo/pytables-users +.. _`User's Guide`: https://www.pytables.org/usersguide +.. _HDF5: http://www.hdfgroup.org/HDF5 +.. _SQLite: http://www.sqlite.org +.. _NumPy: http://www.numpy.org +.. _`Python DBAPI`: http://www.python.org/dev/peps/pep-0249 diff --git a/doc/source/cookbook/index.rst b/doc/source/cookbook/index.rst new file mode 100644 index 0000000..c9a9c45 --- /dev/null +++ b/doc/source/cookbook/index.rst @@ -0,0 +1,19 @@ +================= +PyTables Cookbook +================= + +-------- +Contents +-------- + +.. toctree:: + :maxdepth: 1 + + hints_for_sql_users + PyTables & py2exe Howto (by Tommy Edvardsen) + How to install PyTables when you're not root (by Koen van de Sande) + tailoring_atexit_hooks + custom_data_types + simple_table + inmemory_hdf5_files + threading diff --git a/doc/source/cookbook/inmemory_hdf5_files.rst b/doc/source/cookbook/inmemory_hdf5_files.rst new file mode 100644 index 0000000..9797338 --- /dev/null +++ b/doc/source/cookbook/inmemory_hdf5_files.rst @@ -0,0 +1,140 @@ +==================== +In-memory HDF5 files +==================== + +The HDF5 library provides functions to allow an application to work with a +file in memory for faster reads and writes. File contents are kept in memory +until the file is closed. 
At closing, the memory version of the file can be +written back to disk or abandoned. + + +Open an existing file in memory +=============================== + +Assuming the :file:`sample.h5` exists in the current folder, it is possible to +open it in memory simply using the CORE driver at opening time. + +The HDF5 driver that one intend to use to open/create a file can be specified +using the *driver* keyword argument of the :func:`tables.open_file` function:: + + >>> import tables + >>> h5file = tables.open_file("sample.h5", driver="H5FD_CORE") + +The content of the :file`sample.h5` is opened for reading. It is loaded into +memory and all reading operations are performed without disk I/O overhead. + +.. note:: + + the initial loading of the entire file into memory can be time expensive + depending on the size of the opened file and on the performances of the + disk subsystem. + +.. seealso:: + + general information about HDF5 drivers can be found in the `Alternate + File Storage Layouts and Low-level File Drivers`__ section of the `HDF5 + User's Guide`_. + +__ `HDF5 drivers`_ + + +Creating a new file in memory +============================= + +Creating a new file in memory is as simple as creating a regular file, just +one needs to specify to use the CORE driver:: + + >>> import tables + >>> h5file = tables.open_file("new_sample.h5", "w", driver="H5FD_CORE") + >>> import numpy + >>> a = h5file.create_array(h5file.root, "array", numpy.zeros((300, 300))) + >>> h5file.close() + + +Backing store +============= + +In the previous example contents of the in-memory `h5file` are automatically +saved to disk when the file descriptor is closed, so a new +:file:`new_sample.h5` file is created and all data are transferred to disk. + +Again this can be time a time expensive action depending on the amount of +data in the HDF5 file and depending on how fast the disk I/O is. + +Saving data to disk is the default behavior for the CORE driver in PyTables. + +This feature can be controlled using the *driver_core_backing_store* +parameter of the :func:`tables.open_file` function. Setting it to `False` +disables the backing store feature and all changes in the working `h5file` +are lost after closing:: + + >>> h5file = tables.open_file("new_sample.h5", "w", driver="H5FD_CORE", + ... driver_core_backing_store=0) + +Please note that the *driver_core_backing_store* disables saving of data, not +loading. +In the following example the :file:`sample.h5` file is opened in-memory in +append mode. All data in the existing :file:`sample.h5` file are loaded into +memory and contents can be actually modified by the user:: + + >>> import tables + >>> h5file = tables.open_file("sample.h5", "a", driver="H5FD_CORE", + driver_core_backing_store=0) + >>> import numpy + >>> h5file.create_array(h5file.root, "new_array", numpy.arange(20), + title="New array") + >>> array2 = h5file.root.array2 + >>> print(array2) + /array2 (Array(20,)) 'New array' + >>> h5file.close() + +Modifications are lost when the `h5file` descriptor is closed. + + +Memory images of HDF5 files +=========================== + +It is possible to get a memory image of an HDF5 file (see +`HDF5 File Image Operations`_). This feature is only available if PyTables +is build against version 1.8.9 or newer of the HDF5 library. + +In particular getting a memory image of an HDF5 file is possible only if the +file has been opened with one of the following drivers: SEC2 (the default +one), STDIO or CORE. 
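+
+If in doubt about which HDF5 version your PyTables installation was built
+against, it can be checked at run time before relying on the file image
+functions (a minimal check; the version string shown below is only an
+example)::
+
+    >>> import tables
+    >>> tables.hdf5_version   # version of the underlying HDF5 library
+    '1.12.1'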
+ +An example of how to get an image:: + + >>> import tables + >>> h5file = tables.open_file("sample.h5") + >>> image = h5file.get_file_image() + >>> h5file.close() + +The memory ìmage of the :file:`sample.h5` file is copied into the `ìmage` +string (of bytes). + +.. note:: + + the `ìmage` string contains all data stored in the HDF5 file so, of + course, it can be quite large. + +The `ìmage` string can be passed around and can also be used to initialize a +new HDF5 file descriptor:: + + >>> import tables + >>> h5file = tables.open_file("in-memory-sample.h5", driver="H5FD_CORE", + driver_core_image=image, + driver_core_backing_store=0) + >>> print(h5file.root.array) + /array (Array(300, 300)) 'Array' + >>> h5file.setNodeAttr(h5file.root, "description", "In memory file example") + + + +----- + + +.. target-notes:: + +.. _`HDF5 drivers`: http://www.hdfgroup.org/HDF5/doc/UG/08_TheFile.html#Drivers +.. _`HDF5 User's Guide`: https://portal.hdfgroup.org/display/HDF5/HDF5+User+Guides +.. _`HDF5 File Image Operations`: http://www.hdfgroup.org/HDF5/doc/Advanced/FileImageOperations/HDF5FileImageOperations.pdf diff --git a/doc/source/cookbook/no_root_install.rst b/doc/source/cookbook/no_root_install.rst new file mode 100644 index 0000000..69ac22b --- /dev/null +++ b/doc/source/cookbook/no_root_install.rst @@ -0,0 +1,172 @@ +:author: localhost +:date: 2008-04-21 11:12:44 + +.. todo:: update to use new SW versions + + +Installing PyTables when you're not root +======================================== + +By `Koen van de Sande `_. + +.. warning:: contents of this recipe may be outdated. + +This guide describes how to install PyTables and its dependencies on Linux or +other \*nix systems when your user account is not root. +Installing the HDF5_ shared libraries and Python extension +NumPy requires some non-trivial steps to work. +We describe all steps needed. +They only assumption is that you have Python 3.6 or higher and a C/C++ +compiler (gcc) installed. + + +Installing HDF5 +--------------- + +* First go to or make a temporary folder where we can download and compile + software. + We'll assume you're in this temporary folder in the rest of this section. +* Download `hdf5-1.12.1.tar.gz` from https://www.hdfgroup.org/downloads/hdf5 +* Extract the archive to the current folder:: + + tar xzvf hdf5-1.12.1.tar.gz + +* Go to the extracted HDF5 folder:: + + cd hdf5-1.12.1 + +* Run the configure script:: + + ./configure + +* Run make:: + + make install + +* We've now compiled HDF5_ into the `hdf5` folder inside the source tree. + We'll need to move this to its final location. + For this guide, we'll make a `software` folder inside your home directory + to store installed libraries:: + + mkdir ~/software + +* Move the files to the right location:: + + mv hdf5 ~/software/ + + +Installing NumPy +---------------- + +* From the `NumPy page on PyPI `_ + download NumPy 1.21.5 (at time of writing) to our temporary folder. +* Extract the archive:: + + tar xzvf numpy-1.21.5.tar.gz + +* Go to the NumPy folder:: + + cd numpy-1.21.5 +* Build and install the Python module into our software folder:: + + python3 setup.py install --home=~/software + + +Python wrapper script +--------------------- + +We've installed all dependencies of PyTables. +We need to create a wrapper script for Python to let PyTables actually find +all these dependencies. +Had we installed them as root, they'd be trivial to find, but now we need to +help a bit. 
+ +* Create a script with the following contents (I've called this script `p` on + my machine):: + + #!/bin/bash + export PYTHONPATH=~/software/lib/python + export HDF5_DIR=~/software/hdf5 + export LD_LIBRARY_PATH=~/software/lib/python/tables:~/software/hdf5/lib + python3 $* + +* Make the script executable:: + + chmod 755 p + +* Place the script somewhere on your path (for example, inside a folder + called `bin` inside your home dir, which is normally added to the path + automatically). + If you do not add this script to your path, you'll have to replace `p` in + scripts below by the full path (and name of) your script, e.g. + `~/pytablespython.sh` if you called it `pytablespython.sh` and put it in + your home dir. +* Test your Python wrapper script:: + + p + +* It should now start Python. And you should be able to import `numpy` + without errors:: + + >>> import numpy + + +.. note:: + + you could do this differently by defining these environment settings + somewhere in your startup scripts, but this wrapper script approach is + cleaner. + + +Installing PyTables +------------------- + +* From the `PyPI page `_ + download PyTables 3.7.0 (at time of writing) to our temporary folder. +* Extract the archive:: + + tar xzvf pytables-3.7.0.tar.gz + +* Go to the PyTables folder:: + + cd pytables-3.7.0 + +* Install PyTables using our wrapper script:: + + p setup.py install --home=~/software + + +Running Python with PyTables support +------------------------------------ + +* Use your Python wrapper script to start Python:: + + p + +* You can now import `tables` without errors:: + + >>> import tables + >>> tables.__version__ + '3.7.0' + + +Concluding remarks +------------------ + +* It is safe to remove the temporary folder we have used in this guide, + there are no dependencies on it. +* This guide was written for and tested with HDF5 1.12.1, PyTables 3.7.6 and + Numpy 1.21.5. + + +Enjoy working with PyTables! + +*Koen* + + +----- + + +.. target-notes:: + +.. _HDF5: http://www.hdfgroup.org/HDF5 diff --git a/doc/source/cookbook/py2exe_howto.rst b/doc/source/cookbook/py2exe_howto.rst new file mode 100644 index 0000000..c06995b --- /dev/null +++ b/doc/source/cookbook/py2exe_howto.rst @@ -0,0 +1,82 @@ +:author: localhost +:date: 2008-04-21 11:12:45 + +.. todo:: update the code example to numpy + +============================================================= +How to integrate PyTables in your application by using py2exe +============================================================= + +This document shortly describes how to build an executable when using PyTables. +Py2exe_ is a third party product that converts python scripts into standalone +windows application/programs. +For more information about py2exe please visit http://www.py2exe.org. + +To be able to use py2exe you have to download and install it. +Please follow the instructions at http://www.py2exe.org. + +Let’s assume that you have written a python script as in the attachment +:download:`py2exe_howto/pytables_test.py` + +.. literalinclude:: py2exe_howto/pytables_test.py + :linenos: + +To wrap this script into an executable you have to create a setup script and a +configuration script in your program directory. 
+ +The setup script will look like this:: + + from setuptools import setup + import py2exe + setup(console=['pytables_test.py']) + +The configuration script (:file:`setup.cfg`) specifies which modules to be +included and excluded:: + + [py2exe] + excludes= Tkconstants,Tkinter,tcl + includes= encodings.*, tables.*, numpy.* + +As you can see I have included everything from tables (tables.*) and numpy +(numpy.*). + +Now you are ready to build the executable file (:file:`pytable_test.exe`). +During the build process a subfolder called *dist* will be created. +This folder contains everything needed for your program. +All dependencies (dll's and such stuff) will be copied into this folder. +When you distribute your application you have to distribute all files and +folders inside the *dist* folder. + +Below you can see how to start the build process (`python setup.py py2exe`):: + + c:pytables_test> python3 setup.py py2exe + ... + BUILDING EXECUTABLE + ... + +After the build process I enter the *dist* folder and start +:file:`pytables_test.exe`. + +:: + + c:pytables_test> cd dist + + c:pytables_testdist> pytables_test.exe + tutorial.h5 (File) 'Test file' + Last modif.: 'Tue Apr 04 23:09:17 2006' + Object Tree: + / (RootGroup) 'Test file' + /detector (Group) 'Detector information' + /detector/readout (Table(0,)) 'Readout example' + + [25.0, 36.0, 49.0] + +DONE! + + +----- + + +.. target-notes:: + +.. _py2exe: http://www.py2exe.org diff --git a/doc/source/cookbook/py2exe_howto/pytables_test.py b/doc/source/cookbook/py2exe_howto/pytables_test.py new file mode 100644 index 0000000..ce34e1e --- /dev/null +++ b/doc/source/cookbook/py2exe_howto/pytables_test.py @@ -0,0 +1,42 @@ +import tables as tb + + +class Particle(tb.IsDescription): + name = tb.StringCol(16) # 16-character String + idnumber = tb.Int64Col() # Signed 64-bit integer + ADCcount = tb.UInt16Col() # Unsigned short integer + TDCcount = tb.UInt8Col() # Unsigned byte + grid_i = tb.Int32Col() # Integer + grid_j = tb.IntCol() # Integer (equivalent to Int32Col) + pressure = tb.Float32Col() # Float (single-precision) + energy = tb.FloatCol() # Double (double-precision) + + +with tb.open_file("tutorial.h5", mode="w", title="Test file") as h5file: + group = h5file.create_group("/", "detector", "Detector information") + table = h5file.create_table(group, "readout", Particle, "Readout example") + + print(h5file) + + particle = table.row + + for i in range(10): + particle['name'] = f'Particle: {i:6d}' + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = float(i * i) + particle['energy'] = float(particle['pressure'] ** 4) + particle['idnumber'] = i * (2 ** 34) + particle.append() + + table.flush() + +with tb.open_file("tutorial.h5", mode="r", title="Test file") as h5file: + table = h5file.root.detector.readout + pressure = [x['pressure'] + for x in table.iterrows() + if x['TDCcount'] > 3 and 20 <= x['pressure'] < 50] + + print(pressure) diff --git a/doc/source/cookbook/simple_table.rst b/doc/source/cookbook/simple_table.rst new file mode 100644 index 0000000..4504143 --- /dev/null +++ b/doc/source/cookbook/simple_table.rst @@ -0,0 +1,132 @@ +:author: FrancescAlted +:date: 2010-04-20 16:44:41 + + +=================================================== +SimpleTable: simple wrapper around the Table object +=================================================== + +Here it is yet another example on how to inherit from the :class:`tables.Table` +object so as 
to build an easy-to-use Table object. +Thanks to Brent Pedersen for this one (taken from +https://pypi.python.org/pypi/simpletable). + +:: + + """ + + SimpleTable: simple wrapper around pytables hdf5 + ------------------------------------------------------------------------------ + + Example Usage:: + + >>> from simpletable import SimpleTable + >>> import tables + + # define the table as a subclass of simple table. + >>> class ATable(SimpleTable): + ... x = tables.Float32Col() + ... y = tables.Float32Col() + ... name = tables.StringCol(16) + + # instantiate with: args: filename, tablename + >>> tbl = ATable('test_docs.h5', 'atable1') + + # insert as with pytables: + >>> row = tbl.row + >>> for i in range(50): + ... row['x'], row['y'] = i, i * 10 + ... row['name'] = "name_%i" % i + ... row.append() + >>> tbl.flush() + + # there is also insert_many() method() with takes an iterable + # of dicts with keys matching the colunns (x, y, name) in this + # case. + + # query the data (query() alias of tables' readWhere() + >>> tbl.query('(x > 4) & (y < 70)') #doctest: +NORMALIZE_WHITESPACE + array([('name_5', 5.0, 50.0), ('name_6', 6.0, 60.0)], + dtype=[('name', '|S16'), ('x', ' 0 + + if verbose and are_open_files: + sys.stderr.write("Closing remaining open files:") + + if Version(tables.__version__) >= Version("3.1.0"): + # make a copy of the open_files.handlers container for the iteration + handlers = list(open_files.handlers) + else: + # for older versions of pytables, setup the handlers list from the + # keys + keys = open_files.keys() + handlers = [] + for key in keys: + handlers.append(open_files[key]) + + for fileh in handlers: + if verbose: + sys.stderr.write("%s..." % fileh.filename) + + fileh.close() + + if verbose: + sys.stderr.write("done") + + if verbose and are_open_files: + sys.stderr.write("\n") + + import sys, atexit + atexit.register(my_close_open_files, False) + +then, you won't get the closing messages anymore because the new registered +function is executed before the existing one. +If you want the messages back again, just set the verbose parameter to true. + +You can also use the `atexit` hooks to perform other cleanup functions as well. + diff --git a/doc/source/cookbook/threading.rst b/doc/source/cookbook/threading.rst new file mode 100644 index 0000000..ca55903 --- /dev/null +++ b/doc/source/cookbook/threading.rst @@ -0,0 +1,274 @@ +========= +Threading +========= + +.. py:currentmodule:: tables + + +Background +========== + +Several bug reports have been filed in the past by the users regarding +problems related to the impossibility to use PyTables in multi-thread +programs. + +The problem was mainly related to an internal registry that forced the +sharing of HDF5 file handles across multiple threads. + +In PyTables 3.1.0 the code for file handles management has been completely +redesigned (see the *Backward incompatible changes* section in +:doc:`../release-notes/RELEASE_NOTES_v3.1.x`) to be more simple and +transparent and to allow the use of PyTables in multi-thread programs. + +Citing the :doc:`../release-notes/RELEASE_NOTES_v3.1.x`:: + + It is important to stress that the new implementation still has an + internal registry (implementation detail) and it is still + **not thread safe**. + Just now a smart enough developer should be able to use PyTables in a + muti-thread program without too much headaches. 
+
+
+A common schema for concurrency
+===============================
+
+Although it is probably not the most efficient or elegant way to solve this
+class of problems, many users like to load a portion of data and process it
+inside a *thread function*, using multiple threads to process the entire
+dataset.
+
+Each thread is responsible for:
+
+* opening the (same) HDF5 file for reading,
+* loading data from it, and
+* closing the HDF5 file itself.
+
+Each file handle is for the exclusive use of the thread that opened it, and
+file handles are never shared across threads.
+
+To do this safely with PyTables, some care is needed when opening and
+closing HDF5 files, in order to ensure the correct behaviour of the internal
+machinery used to manage HDF5 file handles.
+
+
+Very simple solution
+====================
+
+A very simple solution for this kind of scenario is to use a
+:class:`threading.Lock` around the parts of the code that are considered
+critical, e.g. the :func:`open_file` function and the :meth:`File.close`
+method::
+
+    import threading
+
+    import tables as tb
+
+    lock = threading.Lock()
+
+    def synchronized_open_file(*args, **kwargs):
+        with lock:
+            return tb.open_file(*args, **kwargs)
+
+    def synchronized_close_file(self, *args, **kwargs):
+        with lock:
+            return self.close(*args, **kwargs)
+
+
+The :func:`synchronized_open_file` and :func:`synchronized_close_file`
+functions can then be used in the *thread function* to open and close the
+HDF5 file::
+
+    import numpy as np
+    import tables as tb
+
+    def run(filename, path, inqueue, outqueue):
+        h5file = None
+        try:
+            yslice = inqueue.get()
+            h5file = synchronized_open_file(filename, mode='r')
+            h5array = h5file.get_node(path)
+            data = h5array[yslice, ...]
+            psum = np.sum(data)
+        except Exception as e:
+            outqueue.put(e)
+        else:
+            outqueue.put(psum)
+        finally:
+            if h5file is not None:
+                synchronized_close_file(h5file)
+
+
+Finally, the main function of the program:
+
+* instantiates the input and output :class:`queue.Queue`,
+* starts all threads,
+* sends the processing requests to the input :class:`queue.Queue`,
+* collects the results by reading from the output :class:`queue.Queue`, and
+* performs finalization actions (:meth:`threading.Thread.join`).
+
+.. code-block:: python
+
+    import os
+    import queue
+    import threading
+
+    import numpy as np
+    import tables as tb
+
+    SIZE = 100
+    NTHREADS = 5
+    FILENAME = 'simple_threading.h5'
+    H5PATH = '/array'
+
+    def create_test_file(filename):
+        data = np.random.rand(SIZE, SIZE)
+
+        with tb.open_file(filename, 'w') as h5file:
+            h5file.create_array('/', 'array', title="Test Array", obj=data)
+
+    def chunk_generator(data_size, nchunks):
+        chunk_size = int(np.ceil(data_size / nchunks))
+        for start in range(0, data_size, chunk_size):
+            yield slice(start, start + chunk_size)
+
+    def main():
+        # generate the test data
+        if not os.path.exists(FILENAME):
+            create_test_file(FILENAME)
+
+        threads = []
+        inqueue = queue.Queue()
+        outqueue = queue.Queue()
+
+        # start all threads
+        for i in range(NTHREADS):
+            thread = threading.Thread(
+                target=run, args=(FILENAME, H5PATH, inqueue, outqueue))
+            thread.start()
+            threads.append(thread)
+
+        # push requests into the input queue
+        for yslice in chunk_generator(SIZE, len(threads)):
+            inqueue.put(yslice)
+
+        # collect results
+        try:
+            mean_ = 0.
+
+            for i in range(len(threads)):
+                out = outqueue.get()
+                if isinstance(out, Exception):
+                    raise out
+                else:
+                    mean_ += out
+
+            mean_ /= SIZE * SIZE
+
+        finally:
+            for thread in threads:
+                thread.join()
+
+        # print results
+        print('Mean: {}'.format(mean_))
+
+    if __name__ == '__main__':
+        main()
+
+The program in the example computes the mean value of a potentially huge
+dataset by splitting the computation across :data:`NTHREADS` (5 in this
+case) threads.
+
+The complete and working code of this example (Python 3 is required) can be
+found in the :file:`examples` directory:
+:download:`simple_threading.py <../../../examples/simple_threading.py>`.
+
+The approach presented in this section is very simple and readable but has
+the **drawback** that the user code has to be modified to replace the
+:func:`open_file` and :meth:`File.close` calls with their safe versions
+(:func:`synchronized_open_file` and :func:`synchronized_close_file`).
+
+Also, the solution shown in the example does not cover the entire PyTables
+API (e.g., although not recommended, HDF5 files can also be opened using the
+:class:`File` constructor) and makes it impossible to use *pythonic*
+constructs like the *with* statement::
+
+    with tb.open_file(filename) as h5file:
+        do_something(h5file)
+
+
+Monkey-patching PyTables
+========================
+
+An alternative to the `Very simple solution`_ presented in the previous
+section consists in monkey-patching the PyTables package to replace some of
+its components with more thread-safe versions of themselves::
+
+    import functools
+    import threading
+
+    import tables as tb
+    import tables.file as _tables_file
+
+    class ThreadsafeFileRegistry(_tables_file._FileRegistry):
+        lock = threading.RLock()
+
+        @property
+        def handlers(self):
+            return self._handlers.copy()
+
+        def add(self, handler):
+            with self.lock:
+                return super().add(handler)
+
+        def remove(self, handler):
+            with self.lock:
+                return super().remove(handler)
+
+        def close_all(self):
+            with self.lock:
+                return super().close_all()
+
+    class ThreadsafeFile(_tables_file.File):
+        def __init__(self, *args, **kwargs):
+            with ThreadsafeFileRegistry.lock:
+                super().__init__(*args, **kwargs)
+
+        def close(self):
+            with ThreadsafeFileRegistry.lock:
+                super().close()
+
+    @functools.wraps(tb.open_file)
+    def synchronized_open_file(*args, **kwargs):
+        with ThreadsafeFileRegistry.lock:
+            return _tables_file._original_open_file(*args, **kwargs)
+
+    # monkey patch the tables package
+    _tables_file._original_open_file = _tables_file.open_file
+    _tables_file.open_file = synchronized_open_file
+    tb.open_file = synchronized_open_file
+
+    _tables_file._original_File = _tables_file.File
+    _tables_file.File = ThreadsafeFile
+    tb.File = ThreadsafeFile
+
+    _tables_file._open_files = ThreadsafeFileRegistry()
+
+
+At this point PyTables can be used transparently in the example program
+presented in the previous section.
+In particular, the standard PyTables API (including the *with* statement)
+can be used in the *thread function*::
+
+    def run(filename, path, inqueue, outqueue):
+        try:
+            yslice = inqueue.get()
+            with tb.open_file(filename, mode='r') as h5file:
+                h5array = h5file.get_node(path)
+                data = h5array[yslice, ...]
+                psum = np.sum(data)
+        except Exception as e:
+            outqueue.put(e)
+        else:
+            outqueue.put(psum)
+
+
+The complete code of this version of the example can be found in the
+:file:`examples` folder:
+:download:`threading_monkeypatch.py <../../../examples/threading_monkeypatch.py>`.
+Python 3 is required.
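+
+If the rest of the application later needs the stock behaviour back, the
+patch can also be reverted, since the snippet above keeps references to the
+original callables in ``_tables_file._original_open_file`` and
+``_tables_file._original_File``.  The following is only a minimal sketch
+building on that snippet (it is not part of the downloadable example); note
+that the original registry object is not saved by the patch, so the
+thread-safe registry is simply left in place::
+
+    def undo_monkey_patch():
+        # restore the original open_file() function
+        _tables_file.open_file = _tables_file._original_open_file
+        tb.open_file = _tables_file._original_open_file
+
+        # restore the original File class
+        _tables_file.File = _tables_file._original_File
+        tb.File = _tables_file._original_File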
+ + diff --git a/doc/source/dev_team.rst b/doc/source/dev_team.rst new file mode 100644 index 0000000..4c5860a --- /dev/null +++ b/doc/source/dev_team.rst @@ -0,0 +1,25 @@ +======================== +PyTables Governance Team +======================== + +The PyTables team includes: + +* Francesc Alted +* Ivan Vilata +* Scott Prater +* Vicent Mas +* Tom Hedley +* `Antonio Valentino`_ +* Jeffrey Whitaker +* `Josh Moore`_ +* `Anthony Scopatz`_ +* `Andrea Bedini`_ +* `Tom Kooij`_ +* `Javier Sancho`_ + +.. _Anthony Scopatz: https://github.com/scopatz +.. _Antonio Valentino: https://github.com/avalentino +.. _Josh Moore: https://github.com/joshmoore +.. _Andrea Bedini: https://github.com/andreabedini +.. _Tom Kooij: https://github.com/tomkooij +.. _Javier Sancho: https://en.jsancho.org/ diff --git a/doc/source/development.rst b/doc/source/development.rst new file mode 100644 index 0000000..7eeed90 --- /dev/null +++ b/doc/source/development.rst @@ -0,0 +1,35 @@ +==================== +PyTables Development +==================== + +If you want to follow the development of PyTables and take part in it, +you may have a look at the PyTables project pages on +`GitHub `_. + +The source code for PyTables may be found at the `GitHub project site`_. +You can get a copy of the latest version of the source code (under +development) from the master branch of the project repository using git:: + + git clone --recursive git@github.com:PyTables/PyTables.git + +Also, be sure to subscribe to the `Users' Mailing List`_ and/or the +`Developers' Mailing List`_. + +.. _`GitHub project site`: https://github.com/PyTables +.. _`Users' Mailing List`: https://groups.google.com/group/pytables-users +.. _`Developers' Mailing List`: https://groups.google.com/group/pytables-dev + +Other resources for developers: + +* `GitHub project site`_ +* :ref:`library_reference` +* `Git Repository browser `_ +* `Issue tracker `_ +* `Developers wiki `_ +* `Users' Mailing List`_ +* `Developers' Mailing List`_ +* Continuous Integration: + + - `GitHub Actions (GHA) `_ + +.. todo:: improve this section diff --git a/doc/source/downloads.rst b/doc/source/downloads.rst new file mode 100644 index 0000000..4f02c28 --- /dev/null +++ b/doc/source/downloads.rst @@ -0,0 +1,54 @@ +========= +Downloads +========= + +Stable Versions +--------------- + +The stable versions of PyTables can be downloaded from the file `download +area`_ on SourceForge.net. The full distribution contains a copy of this +documentation in HTML. The documentation in both HTML and PDF formats can +also be downloaded separately from the same URL. + +A *pure source* version of the package (mainly intended for developers and +packagers) is available on the `tags page`_ on GitHub. It contains all files +under SCM but not the (generated) files, HTML doc and *cythonized* C +extensions, so it is smaller that the standard package (about 3.5MB). + +Windows binaries can be obtained from many different distributions, like +`Python(x,y)`_, ActiveState_, or Enthought_. +In addition, Christoph Gohlke normally does an excellent job by providing +binaries for many interesting software on his +`website `_. + +You may be interested to install the latest released stable version:: + + $ pip install tables + +Or, you may prefer to install the stable version in Git repository +using :program:`pip`. For example, for the stable 3.1 series, you can do:: + + $ pip install --install-option='--prefix=' \ + -e git+https://github.com/PyTables/PyTables.git@v.3.1#egg=tables + +.. 
_`download area`: http://sourceforge.net/projects/pytables/files/pytables +.. _`tags page`: https://github.com/PyTables/PyTables/tags +.. _`Python(x,y)`: http://code.google.com/p/pythonxy +.. _ActiveState: http://www.activestate.com/activepython +.. _Enthought: https://www.enthought.com/products/epd + + +Bleeding Edge Versions +---------------------- + +The latest, coolest, and possibly buggiest ;-) sources can be obtained from +the new github repository: + +https://github.com/PyTables/PyTables + +A +`snapshot `_ +of the code in development is also available on the `GitHub project page`_. + +.. _`GitHub project page`: https://github.com/PyTables/PyTables + diff --git a/doc/source/images/NumFocusSponsoredStamp.png b/doc/source/images/NumFocusSponsoredStamp.png new file mode 100644 index 0000000..b8d8e6c Binary files /dev/null and b/doc/source/images/NumFocusSponsoredStamp.png differ diff --git a/doc/source/images/favicon.ico b/doc/source/images/favicon.ico new file mode 100644 index 0000000..650d85b Binary files /dev/null and b/doc/source/images/favicon.ico differ diff --git a/doc/source/images/pytables-logo-notext.svg b/doc/source/images/pytables-logo-notext.svg new file mode 100644 index 0000000..7bc451b --- /dev/null +++ b/doc/source/images/pytables-logo-notext.svg @@ -0,0 +1,160 @@ + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/images/pytables-logo.svg b/doc/source/images/pytables-logo.svg new file mode 100644 index 0000000..8fe0acc --- /dev/null +++ b/doc/source/images/pytables-logo.svg @@ -0,0 +1,162 @@ + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..8afc67b --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,68 @@ +=================================== +Welcome to PyTables' documentation! +=================================== + +PyTables is a package for managing hierarchical datasets and designed +to efficiently and easily cope with extremely large amounts of data. +You can download PyTables and use it for free. You can access documentation, +some examples of use and presentations here. + +PyTables is built on top of the HDF5 library, using the Python language +and the NumPy package. It features an object-oriented interface that, +combined with C extensions for the performance-critical parts of the +code (generated using Cython), makes it a fast, yet extremely easy to +use tool for interactively browsing, processing and searching very large +amounts of data. One important feature of PyTables is that it optimizes +memory and disk resources so that data takes much less space (specially +if on-flight compression is used) than other solutions such as relational +object oriented databases. + +You can also find more information by reading the PyTables :doc:`FAQ`. + +PyTables development is a continuing effort and we are always looking for +more developers, testers, and users. If you are interested in being +involved with this project, please contact us via `github`_ or the +`mailing list`_. + +.. image:: images/NumFocusSponsoredStamp.png + :alt: NumFocus Sponsored Stamp + :align: center + :width: 300 + :target: http://www.numfocus.org + +Since August 2015, PyTables is a `NumFOCUS project`_, which means that +your donations are fiscally sponsored under the NumFOCUS umbrella. Please +consider `donating to NumFOCUS`_. + + +-------- +Contents +-------- + +.. 
toctree:: + :maxdepth: 1 + + User’s Guide + Cookbook + FAQ + other_material + Migrating from 2.x to 3.x + downloads + Release Notes + project_pointers + Development + Development Team + + +============= +Helpful Links +============= + +* :ref:`genindex` +* :ref:`search` + + +.. _github: https://github.com/PyTables/PyTables +.. _`mailing list`: https://groups.google.com/group/pytables-users +.. _`NumFOCUS project`: http://www.numfocus.org/open-source-projects.html +.. _`donating to NumFOCUS`: https://numfocus.salsalabs.org/donate-to-pytables/index.html diff --git a/doc/source/other_material.rst b/doc/source/other_material.rst new file mode 100644 index 0000000..4034daa --- /dev/null +++ b/doc/source/other_material.rst @@ -0,0 +1,89 @@ +============== +Other Material +============== + +Videos +====== + +These are the videos of a series dedicated to introduce the main features of +PyTables in a visual and easy to grasp manner. +More videos will be made available with the time: + +* `HDF5 is for Lovers, SciPy 2012 Tutorial `_: + a beginner's introduction to PyTables and HDF5. + + +Presentations +============= + +Here are the slides of some presentations about PyTables that you may find +useful: + +* HDF5 is for Lovers, SciPy 2012 Tutorial, July 2012, Austin, TX, USA, + `slides (pdf) `_, + `video `_, + `exercises `_, + `solutions `_, and + `repository `_. +* `An on-disk binary data container `_. + Talk given at the `Austin Python Meetup `_, + Austin, TX, USA (May 2012). +* `Large Data Analysis with Python `_. + Seminar given at the `German Neuroinformatics Node `_, + Munich, Germany (November 2010). +* `Starving CPUs (and coping with that in PyTables) + `_. + Seminar given at `FOM Institute for Plasma Physics Rijnhuizen `_, + The Netherlands (September 2009). +* `On The Data Access Issue (or Why Modern CPUs Are Starving) + `_. + Keynote presented at `EuroSciPy 2009 `_ conference + in Leipzig, Germany (July 2009). +* `An Overview of Future Improvements to OPSI + `_. + Informal talk given at the `THG headquarters `_ in + Urbana-Champaign, Illinois, USA (October 2007). +* `Finding Needles in a Huge DataStack + `_. + Talk given at the **EuroPython 2006 Conference**, held at CERN, Genève, + Switzerland (July 2006). +* `Presentation given at the "HDF Workshop 2005" + `_, held at San Francisco, + USA (December 2005). +* `I `_ and + `II `_ **Workshop in Free + Software and Scientific Computing** given at the Universitat Jaume I, + Castelló, Spain (October 2004). In Catalan. +* `Presentation given at the "SciPy Workshop 2004" + `_, held at Caltech, Pasadena, + USA (September 2004). +* `Slides `_ of presentation + given at **EuroPython Conference** in Charleroi, Belgium (June 2003). +* `Presentation for the "iParty5" `_ + held at Castelló, Spain (May 2003). In Spanish. +* `Talk `_ on PyTables given at + the **PyCon 2003 Convention** held at Washington, USA (March 203). + + +Reports +======= + +* White Paper on `OPSI indexes `_, + explaining the powerful new indexing engine in PyTables Pro. +* `Performance study `_ + on how the new object tree cache introduced in PyTables 1.2 can accelerate + the opening of files with a large number of objects, while being quite less + memory hungry. +* `Paper version `_ of the + presentation at PyCon2003. + + + +Other sources for examples +========================== + +The examples presented above show just a little amount of the full capabilities +of PyTables. 
+Please check out the documentation and the :file:`examples/` directory in the +source package for more examples. + diff --git a/doc/source/project_pointers.rst b/doc/source/project_pointers.rst new file mode 100644 index 0000000..ad92219 --- /dev/null +++ b/doc/source/project_pointers.rst @@ -0,0 +1,21 @@ +================ +Project pointers +================ + +* `Project Home Page `_ +* `GitHub Project Page `_ +* `Online HTML Documentation `_ +* `Download area `_ +* `Git Repository browser `_ +* `Users Mailing List `_ +* `Announce Mailing List `_ +* `Developers Mailing List `_ +* Continuous Integration: + + - `GitHub Actions (GHA) `_ + +* `Project page on PyPi `_ +* `Project Page on SourceForge.net `_ + (needs update) +* `Project page on Launchpad `_ + (going to be closed) diff --git a/doc/source/release-notes/RELEASE_NOTES_v0.7.1.rst b/doc/source/release-notes/RELEASE_NOTES_v0.7.1.rst new file mode 100644 index 0000000..14f7380 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v0.7.1.rst @@ -0,0 +1,33 @@ +PyTables 0.7.1 is out! +---------------------- + +This is a mainly a bug-fixing release, where the next problems has +been addressed: + +- Fixed several memory leaks. After that, the memory + consumption when using large object trees has dropped + sensibly. However, there remains some small leaks, but + hopefully they are not very important unless you use *huge* + object trees. + +- Fixed a bug that make the __getitem__ special method in + table to fail when the stop parameter in a extended slice + was not specified. That is, table[10:] now correctly returns + table[10:table.nrows+1], and not table[10:11]. + +- The removeRows() method in Table did not update the NROWS + attribute in Table objects, giving place to errors after + doing further updating operations (removing or adding more + rows) in the same table. This has been fixed now. + +Apart of these fixes, a new lazy reading algorithm for attributes has +been activated by default. With that, the opening of objects with +large hierarchies has been improved by 60% (you can obtain another +additional 10% if using python 2.3 instead of python 2.2). The +documentation has been updated as well, specially a more detailed +instructions on the compression (zlib) libraries installation. + +Also, a stress test has been conducted in order to see if PyTables can +*really* work not only with large data tables, but also with large +object trees. In it, it has been generated and checked a file with +more than 1 TB of size and more than 100 thousand tables on it!. diff --git a/doc/source/release-notes/RELEASE_NOTES_v0.7.2.rst b/doc/source/release-notes/RELEASE_NOTES_v0.7.2.rst new file mode 100644 index 0000000..48e7d0b --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v0.7.2.rst @@ -0,0 +1,35 @@ +What's new in PyTables 0.7.2 +---------------------------- + +This is a mainly a maintenance release, where the next issues has +been addressed: + +- Fixed a nasty memory leak located on the C libraries (It was + occurring during attribute writes). After that, the memory + consumption when using large object trees has dropped quite + a bit. However, there remains some small leaks that has been + tracked down to the underlying numarray library. These leaks + has been reported, and hopefully they should be fixed more + sooner than later. + +- Table buffers are built dinamically now, so if Tables are + not accessed for reading or writing this memory will not be + booked. This will help to reduce the memory consumption. 
+ +- The opening of files with lots of nodes has been optimized + between a factor 2 and 3. For example, a file with 10 groups + and 3000 tables that takes 9.3 seconds to open in 0.7.1, now + takes only 2.8 seconds. + +- The Table.read() method has been refactored and optimized + and some parts of its code has been moved to Pyrex. In + particular, in the special case of step=1, up to a factor 5 + of speedup (reaching 160 MB/s on a Pentium4 @ 2 GHz) when + reading table contents can be achieved now. + + +Enjoy!, + +-- Francesc Alted +falted@openlc.org + diff --git a/doc/source/release-notes/RELEASE_NOTES_v0.8.rst b/doc/source/release-notes/RELEASE_NOTES_v0.8.rst new file mode 100644 index 0000000..9b9aa2c --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v0.8.rst @@ -0,0 +1,151 @@ +What's new in PyTables 0.8 +---------------------------- + +On this release, many enhancements has been added and some bugs has +been fixed. Here is the (non-exhaustive) list: + +- The new VLArray class enables you to store large lists of rows + containing variable numbers of elements. The elements can + be scalars or fully multidimensional objects, in the PyTables + tradition. This class supports two special objects as rows: + Unicode strings (UTF-8 codification is used internally) and + generic Python objects (through the use of cPickle). + +- The new EArray class allows you to enlarge already existing + multidimensional homogeneous data objects. Consider it + an extension of the already existing Array class, but + with more functionality. Online compression or other filters + can be applied to EArray instances, for example. + + Another nice feature of EA's is their support for fully + multidimensional data selection with extended slices. You + can write "earray[1,2:3,...,4:200]", for example, to get the + desired dataset slice from the disk. This is implemented + using the powerful selection capabilities of the HDF5 + library, which results in very highly efficient I/O + operations. The same functionality has been added to Array + objects as well. + +- New UnImplemented class. If a dataset contains unsupported + datatypes, it will be associated with an UnImplemented + instance, then inserted into to the object tree as usual. + This allows you to continue to work with supported objects + while retaining access to attributes of unsupported + datasets. This has changed from previous versions, where a + RuntimeError occurred when an unsupported object was + encountered. + + The combination of the new UnImplemented class with the + support for new datatypes will enable PyTables to greatly + increase the number of types of native HDF5 files that can + be read and modified. + +- Boolean support has been added for all the Leaf objects. + +- The Table class has now an append() method that allows you + to save large buffers of data in one go (i.e. bypassing the + Row accessor). This can greatly improve data gathering + speed. + +- The standard HDF5 shuffle filter (to further enhance the + compression level) is supported. + +- The standard HDF5 fletcher32 checksum filter is supported. + +- As the supported number of filters is growing (and may be + further increased in the future), a Filters() class has been + introduced to handle filters more easily. In order to add + support for this class, it was necessary to make a change in + the createTable() method that is not backwards compatible: + the "compress" and "complib" parameters are deprecated now + and the "filters" parameter should be used in their + place. 
You will be able to continue using the old parameters + (only a Deprecation warning will be issued) for the next few + releases, but you should migrate to the new version as soon + as possible. In general, you can easily migrate old code by + substituting code in its place:: + + table = fileh.createTable(group, 'table', Test, '', complevel, complib) + + should be replaced by:: + + table = fileh.createTable(group, 'table', Test, '', + Filters(complevel, complib)) + +- A copy() method that supports slicing and modification of + filtering capabilities has been added for all the Leaf + objects. See the User's Manual for more information. + +- A couple of new methods, namely copyFile() and copyChilds(), + have been added to File class, to permit easy replication + of complete hierarchies or sub-hierarchies, even to + other files. You can change filters during the copy + process as well. + +- Two new utilities has been added: ptdump and + ptrepack. The utility ptdump allows the user to examine + the contents of PyTables files (both metadata and actual + data). The powerful ptrepack utility lets you + selectively copy (portions of) hierarchies to specific + locations in other files. It can be also used as an + importer for generic HDF5 files. + +- The meaning of the stop parameter in read() methods has + changed. Now a value of 'None' means the last row, and a + value of 0 (zero) means the first row. This is more + consistent with the range() function in python and the + __getitem__() special method in numarray. + +- The method Table.removeRows() is no longer limited by table + size. You can now delete rows regardless of the size of the + table. + +- The "numarray" value has been added to the flavor parameter + in the Table.read() method for completeness. + +- The attributes (.attr instance variable) are Python + properties now. Access to their values is no longer + lazy, i.e. you will be able to see both system or user + attributes from the command line using the tab-completion + capability of your python console (if enabled). + +- Documentation has been greatly improved to explain all the + new functionality. In particular, the internal format of + PyTables is now fully described. You can now build + "native" PyTables files using any generic HDF5 software + by just duplicating their format. + +- Many new tests have been added, not only to check new + functionality but also to more stringently check + existing functionality. There are more than 800 different + tests now (and the number is increasing :). + +- PyTables has a new record in the data size that fits in one + single file: more than 5 TB (yeah, more than 5000 GB), that + accounts for 11 GB compressed, has been created on an AMD + Opteron machine running Linux-64 (the 64 bits version of the + Linux kernel). See the gory details in: + http://pytables.sf.net/html/HowFast.html. + +- New platforms supported: PyTables has been compiled and tested + under Linux32 (Intel), Linux64 (AMD Opteron and Alpha), Win32 + (Intel), MacOSX (PowerPC), FreeBSD (Intel), Solaris (6, 7, 8 + and 9 with UltraSparc), IRIX64 (IRIX 6.5 with R12000) and it + probably works in many more architectures. In particular, + release 0.8 is the first one that provides a relatively clean + porting to 64-bit platforms. + +- As always, some bugs have been solved (especially bugs that + occur when deleting and/or overwriting attributes). 
+ +- And last, but definitely not least, a new donations section + has been added to the PyTables web site + (http://sourceforge.net/projects/pytables, then follow the + "Donations" tag). If you like PyTables and want this effort + to continue, please, donate! + +Enjoy!, + +-- Francesc Alted +falted@pytables.org + diff --git a/doc/source/release-notes/RELEASE_NOTES_v0.9.1.rst b/doc/source/release-notes/RELEASE_NOTES_v0.9.1.rst new file mode 100644 index 0000000..d6cd5fb --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v0.9.1.rst @@ -0,0 +1,69 @@ +What's new in PyTables 0.9.1 +---------------------------- + +This release is mainly a maintenance version. In it, some bugs has +been fixed and a few improvements has been made. One important thing +is that chunk sizes in EArrays has been re-tuned to get much better +performance. Besides, it has been tested against the latest Python 2.4 +and all unit tests seems to pass fine. + +More in detail: + +Improvements: + +- The chunksize computation for EArrays has been re-tuned to allow the + compression rations that were usual before 0.9 release. + +- New --unpackshort and --quantize flags has been added to nctoh5 + script. --unpackshort unpack short integer variables to float + variables using scale_factor and add_offset netCDF variable + attributes. --quantize quantize data to improve compression using + least_significant_digit netCDF variable attribute (not active by + default). See https://www.ogc.org/standards/netcdf + for further explanation of what this attribute means. Thanks to Jeff + Whitaker for providing this. + +- Table.itersequence has received a new parameter called "sort". This + allows to disable the sorting of the sequence in case the user wants + so. + +Backward-incompatible changes: + +- Now, the AttributeSet class throw an AttributeError on __getattr__ + for nonexistent attributes in it. Formerly, the routine returned + None, which is pretty much against convention in Python and breaks + the built-in hasattr() function. Thanks to Robert Nemec for noting + this and offering a patch. + +- VLArray.read() has changed its behaviour. Now, it always returns a + list, as stated in documentation, even when the number of elements + to return is 0 or 1. This is much more consistent when representing + the actual number of elements on a certain VLArray row. + +API additions: + +- A Row.getTable() has been added. It is an accessor for the associated + Table object. + +- A File.copyAttrs() has been added. It allows copying attributes from + one leaf to other. Properly speaking, this was already there, but not + documented :-/ + +Bug fixes: + +- Now, the copy of hierarchies works even when there are scalar Arrays + (i.e. Arrays which shape is ()) on it. Thanks to Robert Nemec for + providing a patch. + +- Solved a memory leak regarding the Filters instance associated with + the File object, that was not released after closing the file. Now, + there are no known leaks on PyTables itself. + +- Improved security of nodes name checking. 
Closes #1074335 + + +Enjoy data!, + +-- Francesc Altet +falted@pytables.org + diff --git a/doc/source/release-notes/RELEASE_NOTES_v0.9.rst b/doc/source/release-notes/RELEASE_NOTES_v0.9.rst new file mode 100644 index 0000000..28411a3 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v0.9.rst @@ -0,0 +1,120 @@ +What's new in PyTables 0.9 +========================== + +On this release you will find a series of quite +exciting new features, being the most important the indexing +capabilities, in-kernel selections, support for complex datatypes and +the possibility to modify values in both tables *and* arrays (yeah, +finally :). + +New features: +------------- + +- Indexing of columns in tables. That allow to make data selections on + tables up to 500 times faster than standard selections (for + ex. doing a selection along an indexed column of 100 million of rows + takes less than 1 second on a modern CPU). Perhaps the most + interesting thing about the indexing algorithm implemented by + PyTables is that the time taken to index grows *linearly* with the + length of the data, so, making the indexation process to be + *scalable* (quite differently to many relational databases). This + means that it can index, in a relatively quick way, arbitrarily + large table columns (for ex. indexing a column of 100 million of rows + takes just 100 seconds, i.e. at a rate of 1 Mrow/sec). See more + detailed info about that in http://www.pytables.org/docs/SciPy04.pdf. + +- In-kernel selections. This feature allow to make data selections on + tables up to 5 times faster than standard selections (i.e. pre-0.9 + selections), without a need to create an index. As a hint of how + fast these selections can be, they are up to 10 times faster than a + traditional relational database. Again, see + http://www.pytables.org/docs/SciPy04.pdf for some experiments on that + matter. + +- Support of complex datatypes for all the data objects (i.e. Table, + Array, EArray and VLArray). With that, the complete set of datatypes + of Numeric and numarray packages are supported. Thanks to Tom Hedley + for providing the patches for Array, EArray and VLArray objects, as + well as updating the User's Manual and adding unit tests for the new + functionality. + +- Modification of values. You can modify Table, Array, EArray and + VLArray values. See Table.modifyRows, Table.modifyColumns() and the + newly introduced __setitem__() method for Table, Array, EArray and + VLArray entities in the Library Reference of User's Manual. + +- A new sub-package called "nodes" is there. On it, there will be + included different modules to make more easy working with different + entities (like images, files, ...). The first module that has been + added to this sub-package is "FileNode", whose mission is to enable + the creation of a database of nodes which can be used like regular + opened files in Python. In other words, you can store a set of + files in a PyTables database, and read and write it as you would do + with any other file in Python. Thanks to Ivan Vilata i Balaguer for + contributing this. + +Improvements: +------------- + +- New __len__(self) methods added in Arrays, Tables and Columns. This, + in combination with __getitem__(self,key) allows to better emulate + sequences. + +- Better capabilities to import generic HDF5 files. In particular, + Table objects (in the HDF5_HL naming schema) with "holes" in their + compound type definition are supported. 
That allows to read certain + files produced by NASA (thanks to Stephen Walton for reporting this). + +- Much improved test units. More than 2000 different tests has been + implemented which accounts for more than 13000 loc (this represents + twice of the PyTables library code itself (!)). + +Backward-incompatible API changes: +---------------------------------- + +- The __call__ special method has been removed from objects File, + Group, Table, Array, EArray and VLArray. Now, you should use + walkNodes() in File and Group and iterrows in Table, Array, EArray + and VLArray to get the same functionality. This would provide better + compatibility with IPython as well. + +'nctoh5', a new importing utility: + +- Jeff Whitaker has contributed a script to easily convert NetCDF + files into HDF5 files using Scientific Python and PyTables. It has + been included and documented as a new utility. + +Bug fixes: +---------- + +- A call to File.flush() now invoke a call to H5Fflush() so to + effectively flushing all the file contents to disk. Thanks to Shack + Toms for reporting this and providing a patch. + +- SF #1054683: Security hole in utils.checkNameValidity(). Reported in + 2004-10-26 by ivilata + +- SF #1049297: Suggestion: new method File.delAttrNode(). Reported in + 2004-10-18 by ivilata + +- SF #1049285: Leak in AttributeSet.__delattr__(). Reported in + 2004-10-18 by ivilata + +- SF #1014298: Wrong method call in examples/tutorial1-2.py. Reported + in 2004-08-23 by ivilata + +- SF #1013202: Cryptic error appending to EArray on RO file. Reported + in 2004-08-21 by ivilata + +- SF #991715: Table.read(field="var1", flavor="List") fails. Reported + in 2004-07-15 by falted + +- SF #988547: Wrong file type assumption in File.__new__. Reported in + 2004-07-10 by ivilata + + +Bon profit!, + +-- Francesc Altet +falted@pytables.org + diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.0.rst b/doc/source/release-notes/RELEASE_NOTES_v1.0.rst new file mode 100644 index 0000000..7644638 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.0.rst @@ -0,0 +1,218 @@ +============================ + What's new in PyTables 1.0 +============================ + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 0.9.1. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 0.9.1 to PyTables 1.0. + + +API additions +============= + +- The new ``Table.col()`` method can be used to get a column from a table as a + ``NumArray`` or ``CharArray`` object. This is preferred over the syntax + ``table['colname']``. + +- The new ``Table.readCoordinates()`` method reads a set of rows given their + indexes into an in-memory object. + +- The new ``Table.readAppend()`` method Append rows fulfilling the condition + to a destination table. + +Backward-incompatible changes +============================= + +- Trying to open a nonexistent file or a file of unknown type raises + ``IOError`` instead of ``RuntimeError``. Using an invalid mode raises + ``ValueError`` instead of ``RuntimeError``. + +- Getting a child node from a closed group raises ``ValueError`` instead of + ``RuntimeError``. + +- Running an action on the wrong type of node now (i.e. using + ``file.listNodes()`` on a leaf) raises a ``TypeError`` instead of a + ``NodeError``. + +- Removing a non-existing child now raises a ``NoSuchNodeError``, instead of + doing nothing. 
+ +- Removing a non-empty child group using ``del group.child`` fails with a + ``NodeError`` instead of recursively doing the removal. This is because of + the potential damage it may cause when used inadvertently. If a recursive + behavior is needed, use the ``_f_remove()`` method of the child node. + +- The `recursive` flag of ``Group._f_walkNodes()`` is ``True`` by default now. + Before it was ``False``. + +- Now, deleting and getting a non-existing attribute raises an + ``AttributeError`` instead of a ``RuntimeError``. + +- Swapped last two arguments of ``File.copyAttrs()`` to match the other + methods. Please use ``File.copyNodeAttrs()`` anyway. + +- Failing to infer the size of a string column raises ``ValueError`` instead + of ``RuntimeError``. + +- Excessive table column name length and number of columns now raise + ``ValueError`` instead of ``IndexError`` and ``NameError``. + +- Excessive table row length now raises ``ValueError`` instead of + ``RuntimeError``. + +- ``table[integer]`` returns a ``numarray.records.Record`` object instead of a + tuple. This was the original behavior before PyTables 0.9 and proved to be + more consistent than the last one (tables do not have an explicit ordering + of columns). + +- Specifying a nonexistent column in ``Table.read()`` raises a ``ValueError`` + instead of a ``LookupError``. + +- When ``start >= stop`` an empty iterator is returned by ``Table.iterrows()`` + instead of an empty ``RecArray``. Thanks to Ashley Walsh for noting this. + +- The interface of ``isHDF5File()`` and ``isPyTablesFile()`` file has been + unified so that they both return true or false values on success and raise + ``HDF5ExtError`` or errors. The true value in ``isPyTablesFile()`` is the + format version string of the file. + +- ``Table.whereIndexed()`` and ``Table.whereInRange()`` are now *private* + methods, since the ``Table.where()`` method is able to choose the most + adequate option. + +- The global variables ``ExtVersion`` and ``HDF5Version`` have been renamed to + ``extVersion`` and ``hdf5Version``, respectively. + +- ``whichLibVersion()`` returns ``None`` on querying unavailable libraries, + and raises ``ValueError`` on unknown ones. + +The following modifications, though being (strictly speaking) modifications of +the API, will most probably not cause compatibility problems (but your mileage +may vary): + +- The default values for ``name`` and ``classname`` arguments in + ``File.getNode()`` are now ``None``, although the empty string is still + allowed for backwards compatibility. File hierarchy manipulation and + attribute handling operations using those arguments have changed to reflect + this. + +- Copy operations (``Group._f_copyChildren()``, ``File.copyChildren()``, + ``File.copyNode()``...) do no longer return a tuple with the new node and + statistics. Instead, they only return the new node, and statistics are + collected via an optional keyword argument. + +- The ``copyFile()`` function in ``File.py`` has changed its signature from:: + + copyFile(srcfilename=None, dstfilename=None, title=None, filters=None, + copyuserattrs=True, overwrite=False, stats=None) + + to:: + + copyFile(srcfilename, dstfilename, overwrite=False, **kwargs) + + Thus, the function allows the same options as ``File.copyFile()``. 
+ +- The ``File.copyFile()`` method has changed its signature from:: + + copyFile(self, dstfilename=None, title=None, filters=None, + copyuserattrs=1, overwrite=0, stats=None): + + to:: + + copyFile(self, dstfilename, overwrite=False, **kwargs) + + This enables this method to pass on arbitrary flags and options supported by + copying methods of inner nodes in the hierarchy. + +- The ``File.copyChildren()`` method has changed its signature from:: + + copyChildren(self, wheresrc, wheredst, recursive=False, filters=None, + copyuserattrs=True, start=0, stop=None, step=1, + overwrite=False, stats=None) + + to:: + + copyChildren(self, srcgroup, dstgroup, overwrite=False, recursive=False, + **kwargs): + + Thus, the function allows the same options as ``Group._f_copyChildren()``. + +- The ``Group._f_copyChildren()`` method has changed its signature from:: + + _f_copyChildren(self, where, recursive=False, filters=None, + copyuserattrs=True, start=0, stop=None, step=1, + overwrite=False, stats=None) + + to:: + + _f_copyChildren(self, dstgroup, overwrite=False, recursive=False, + **kwargs) + + This enables this method to pass on arbitrary flags and options supported by + copying methods of inner nodes in the group. + +- Renamed ``srcFilename`` and ``dstFilename`` arguments in ``copyFile()`` and + ``File.copyFile()`` to ``srcfilename`` and ``dstfilename``, respectively. + Renamed ``whereSrc`` and ``whereDst`` arguments in ``File.copyChildren()`` + to ``wheresrc`` and ``wheredst``, respectively. Renamed ``dstNode`` + argument in ``File.copyAttrs()`` to ``dstnode``. Tose arguments should be + easier to type in interactive sessions (although 99% of the time it is not + necessary to specify them). + +- Renamed ``object`` argument in ``EArray.append()`` to ``sequence``. + +- The ``rows`` argument in ``Table.append()`` is now compulsory. + +- The ``start`` argument in ``Table.removeRows()`` is now compulsory. + + +API refinements +=============== + +- The ``isHDF5()`` function has been deprecated in favor of ``isHDF5File()``. + +- Node attribute-handling methods in ``File`` have been renamed for a better + coherence and understanding of their purpose: + + * ``getAttrNode()`` is now called ``getNodeAttr()`` + * ``setAttrNode()`` is now called ``setNodeAttr()`` + * ``delAttrNode()`` is now called ``delNodeAttr()`` + * ``copyAttrs()`` is now called ``copyNodeAttrs()`` + + They keep their respective signatures, and the old versions still exist for + backwards compatibility, though they issue a ``DeprecationWarning``. + +- Using ``VLArray.append()`` with multiple arguments is now deprecated for its + ambiguity. You should put the arguments in a single sequence object (list, + tuple, array...) and pass it as the only argument. + +- Using ``table['colname']`` is deprecated. Using ``table.col('colname')`` + (with the new ``col()`` method) is preferred. + + +Bug fixes (affecting API) +========================= + +- ``Table.iterrows()`` returns an empty iterator when no rows are selected, + instead of returning ``None``. + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. 
End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.1.1.rst b/doc/source/release-notes/RELEASE_NOTES_v1.1.1.rst new file mode 100644 index 0000000..2b994a1 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.1.1.rst @@ -0,0 +1,58 @@ +============================== + What's new in PyTables 1.1.1 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.0. +Its main purpose is help you ensure that your programs will be runnable +when you switch from PyTables 1.0 to PyTables 1.1.1. + + +API additions +============= + +- None + +Backward-incompatible changes +============================= + +- ``Table.read()`` raises a ``KeyError`` instead of a ``ValueError`` + when a nonexistent field name is specified, for consistency with other + methods. The same goes for the ``col()`` method. + +- ``File.__contains__()`` returns a true value when it is asked for an + existent node, be it visible or not. This is more consistent with + ``Group.__contains__()``. + + +API refinements +=============== + +- Using ``table.cols['colname']`` is deprecated. The usage of + ``table.cols._f_col('colname')`` (with the new ``Cols._f_col()`` + method) is preferred. + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.1.rst b/doc/source/release-notes/RELEASE_NOTES_v1.1.rst new file mode 100644 index 0000000..b8011a8 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.1.rst @@ -0,0 +1,58 @@ +============================ + What's new in PyTables 1.1 +============================ + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.0. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.0 to PyTables 1.1. + + +API additions +============= + +- something... + +Backward-incompatible changes +============================= + +- ``Table.read()`` raises a ``KeyError`` instead of a ``ValueError`` when a + nonexistent field name is specified, for consistency with other methods. + The same goes for the ``col()`` method. + +- ``File.__contains__()`` returns a true value when it is asked for an existent + node, be it visible or not. This is more consistent with + ``Group.__contains__()``. + + +API refinements +=============== + +- Using ``table.cols['colname']`` is deprecated. The usage of + ``table.cols._f_col('colname')`` (with the new ``Cols._f_col()`` method) is + preferred. + +Bug fixes (affecting API) +========================= + +- something... + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. 
End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.2.1.rst b/doc/source/release-notes/RELEASE_NOTES_v1.2.1.rst new file mode 100644 index 0000000..92f22e2 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.2.1.rst @@ -0,0 +1,56 @@ +============================== + What's new in PyTables 1.2.1 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.2.1. + + +API additions +============= + +- None + +Backward-incompatible changes +============================= + +- None + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- None + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.2.2.rst b/doc/source/release-notes/RELEASE_NOTES_v1.2.2.rst new file mode 100644 index 0000000..17dcf26 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.2.2.rst @@ -0,0 +1,56 @@ +============================== + What's new in PyTables 1.2.2 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.2.2. + + +API additions +============= + +- None + +Backward-incompatible changes +============================= + +- None + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- None + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.2.3.rst b/doc/source/release-notes/RELEASE_NOTES_v1.2.3.rst new file mode 100644 index 0000000..1bcff0e --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.2.3.rst @@ -0,0 +1,56 @@ +============================== + What's new in PyTables 1.2.3 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.2.3. + + +API additions +============= + +- None + +Backward-incompatible changes +============================= + +- None + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- None + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. 
End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.2.rst b/doc/source/release-notes/RELEASE_NOTES_v1.2.rst new file mode 100644 index 0000000..17a20f8 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.2.rst @@ -0,0 +1,105 @@ +============================ + What's new in PyTables 1.2 +============================ + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.1. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.1 to PyTables 1.2. + + +API additions +============= + +- The user is now allowed to set arbitrary Python (non-persistent) attributes + on any instance of ``Node``. If the name matches that of a child node, the + later will no longer be accessible via natural naming, but it will still be + available via ``File.getNode()``, ``Group._f_getChild()`` and the group + children dictionaries. + + Of course, this allows the user to overwrite internal (``^_[cfgv]_``) + PyTables variables, but this is the way most Python packages work. + +- The new ``Group._f_getChild()`` method allows to get a child node (be it + visible or not) by its name. This should be more intuitive that using + ``getattr()`` or using the group children dictionaries. + +- The new ``File.isVisibleNode()``, ``Node._f_isVisible()`` and + ``Leaf.isVisible()`` methods tell whether a node is visible or not, i.e. if + the node will appear in listing operations such as ``Group._f_listNodes()``. + + +Backward-incompatible changes +============================= + +- ``File.objects``, ``File.groups`` and ``File.leaves`` can no longer be used + to iterate over all the nodes in the file. However, they still may be used + to access any node by its path. + +- ``File.__contains__()`` returns a true value when it is asked for an + existent node, be it visible or not. This is more consistent with + ``Group.__contains__()``. + +- Using ``Group.__delattr__()`` to remove a child is no longer supported. + Please use ``Group._f_remove()`` instead. + +- The ``indexprops`` attribute is now present on all ``Table`` instances, be + they indexed or not. In the last case, it is ``None``. + +- Table.getWhereList() now has flavor parameter equal to "NumArray" by + default, which is more consistent with other methods. Before, flavor + defaulted to "List". + +- The ``extVersion`` variable does no longer exist. It did not make much + sense either, since the canonical version of the whole PyTables package is + that of ``__version__``. + +- The ``Row.nrow()`` has been converted into a property, so you have to + replace any call to ``Row.nrow()`` into ``Row.nrow``. + + +Deprecated features +=================== + +- The ``objects``, ``groups`` and ``leaves`` mappings in ``File`` are retained + only for compatibility purposes. Using ``File.getNode()`` is recommended to + access nodes, ``File.__contains__()`` to check for node existence, and + ``File.walkNodes()`` for iteration purposes. Using ``isinstance()`` and + ``*isVisible*()`` methods is the preferred way of checking node type and + visibility. + + Please note that the aforementioned mappings use the named methods + internally, so the former have no special performance gains over the later. + + +API refinements +=============== + +- The ``isHDF5File()`` and ``isPyTablesFile()`` functions know how to handle + nonexistent or unreadable files. An ``IOError`` is raised in those cases. 
+ + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.3.1.rst b/doc/source/release-notes/RELEASE_NOTES_v1.3.1.rst new file mode 100644 index 0000000..45b1755 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.3.1.rst @@ -0,0 +1,66 @@ +============================== + What's new in PyTables 1.3.1 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.3.1. + + +API additions +============= + +- The Table.Cols accessor has received a new __setitem__() method that + allows doing things like: + + table.cols[4] = record + table.cols.x[4:1000:2] = array # homogeneous column + table.cols.Info[4:1000:2] = recarray # nested column + + +Backward-incompatible changes +============================= + +- None + + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- Table.itersequence has changed the default value for 'sort' parameter. It is + now False by default, as it is not clear if this actually accelerates the + iterator, so it is better to let to the user doing the proper checks (if he + is interested at all). + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.3.2.rst b/doc/source/release-notes/RELEASE_NOTES_v1.3.2.rst new file mode 100644 index 0000000..5b6d5bf --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.3.2.rst @@ -0,0 +1,66 @@ +============================== + What's new in PyTables 1.3.2 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.3.2. + + +API additions +============= + +- The ``Table.Cols`` accessor has received a new ``__setitem__()`` method that + allows doing things like:: + + table.cols[4] = record + table.cols.x[4:1000:2] = array # homogeneous column + table.cols.Info[4:1000:2] = recarray # nested column + + +Backward-incompatible changes +============================= + +- None + + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- ``Table.itersequence()`` has changed the default value for the ``sort`` + parameter. It is now false by default, as it is not clear if this actually + accelerates the iterator, so it is better to let the user do the proper + checks (if interested). + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. 
End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.3.3.rst b/doc/source/release-notes/RELEASE_NOTES_v1.3.3.rst new file mode 100644 index 0000000..0c3519a --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.3.3.rst @@ -0,0 +1,57 @@ +============================== + What's new in PyTables 1.3.3 +============================== + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.3.3. + + +API additions +============= + +- None + +Backward-incompatible changes +============================= + +- None + + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- None + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.3.rst b/doc/source/release-notes/RELEASE_NOTES_v1.3.rst new file mode 100644 index 0000000..5c31217 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.3.rst @@ -0,0 +1,66 @@ +============================ + What's new in PyTables 1.3 +============================ + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.2. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.2 to PyTables 1.3. + + +API additions +============= + +- The Table.Cols accessor has received a new __setitem__() method that + allows doing things like: + + table.cols[4] = record + table.cols.x[4:1000:2] = array # homogeneous column + table.cols.Info[4:1000:2] = recarray # nested column + + +Backward-incompatible changes +============================= + +- None + + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- Table.itersequence has changed the default value for 'sort' parameter. It is + now False by default, as it is not clear if this actually accelerates the + iterator, so it is better to let to the user doing the proper checks (if he + is interested at all). + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v1.4.rst b/doc/source/release-notes/RELEASE_NOTES_v1.4.rst new file mode 100644 index 0000000..e933cce --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v1.4.rst @@ -0,0 +1,71 @@ +============================ + What's new in PyTables 1.4 +============================ + + +:Author: Francesc Altet +:Contact: faltet@carabos.com +:Author: Ivan Vilata i Balaguer +:Contact: ivilata@carabos.com + + +This document details the modifications to PyTables since version 1.3. Its +main purpose is help you ensure that your programs will be runnable when you +switch from PyTables 1.3 to PyTables 1.4. + + +API additions +============= + +- The ``Table.getWhereList()`` method has got a new ``sort`` parameter. The + default now is to get the list of parameters unsorted. Set ``sort`` to True + to get the old behaviour. 
We've done this to avoid unnecessary ordering of + potentially large sets of coordinates. + +- Node creation, copying and moving operations have received a new optional + `createparents` argument. When true, the necessary groups in the target + path that don't exist at the time of running the operation are automatically + created, so that the target group of the operation always exists. + + +Backward-incompatible changes +============================= + +- None + + +Deprecated features +=================== + +- None + + +API refinements +=============== + +- ``Description._v_walk()`` has been renamed to ``_f_walk()``, since it is a + public method, not a value. + +- ``Table.removeIndex()`` now accepts a column name in addition to an + ``Index`` instance (the later is deprecated). This avoids the user having + to retrieve the needed ``Index`` object. + + +Bug fixes (affecting API) +========================= + +- None + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: text +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.0.x-pro.rst b/doc/source/release-notes/RELEASE_NOTES_v2.0.x-pro.rst new file mode 100644 index 0000000..c0c8302 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.0.x-pro.rst @@ -0,0 +1,495 @@ +=========================================== + Release notes for PyTables Pro 2.0 series +=========================================== + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.com +:Author: Ivan Vilata i Balaguer +:Contact: ivan@selidor.net + + +Changes from 2.0.3 to 2.0.4 +=========================== + +- Selections in tables works now in threaded environments. The problem was in + the Numexpr package -- the solution has been reported to the upstream + authors too. Fixes #164. + +- PyTables had problems importing native HDF5 files with gaps in nested + compound types. This has been solved. Fixes #173. + +- In order to prevent a bug existing in HDF5 1.6 series, the + ``EArray.truncate()`` method refused to accept a 0 as parameter + (i.e. truncate an existing EArray to have zero rows did not work). As this + has been fixed in the recent HDF5 1.8 series, this limitation has been + removed (but only if the user has one of these installed). Fixes #171. + +- Small fixes for allowing the test suite to pass when using the new NumPy + 1.1. However, it remains a small issue with the way the new NumPy + represents complex numbers. I'm not fixing that in the PyTables suite, as + there are chances that this can be fixed in NumPy itself (see ticket #841). + + +Changes from 2.0.2.1 to 2.0.3 +============================= + +- Replaced the algorithm for computing chunksizes by another that is + more general and useful for a larger range of expected dataset + sizes. The outcome of the new calculation is the same than before + for dataset sizes <= 100 GB. For datasets between 100 GB <= size < + 10 TB, larger values are returned. For sizes >= 10 TB a maximum + value of 1 MB is always returned. + +- Added support for the latest 1.8.0 version of the HDF5 library. + Fixes ticket #127. + +- PyTables compiles now against latest versions of Pyrex (0.9.6.4). For the + first time, the extensions do compile without warnings! Fixes #159. + +- Numexpr module has been put in sync with the version in SciPy sandbox. 
+ +- Added a couple of warnings in User's Guide so as to tell the user that it is + not safe to use methods that can change the number of rows of a table in the + middle of a row iterator. Fixes #153. + +- Fixed a problem when updating multidimensional cells using the + Row.update() method in the middle of table iterators . Fixes #149. + +- Fixed a problem when using 64-bit indexes in 32-bit platforms. + Solves ticket #148. + +- Table.indexFilters is working now as documented. However, as per ticket + #155, its use is now deprecated (will be removed in 2.1). Fixes #155. + + +Changes from 2.0.2 to 2.0.2.1 +============================= + +- Optimization added for avoid to unnecessarily update index columns + that have not been modified in table update operations. Fixes #139. + + +Changes from 2.0.1 to 2.0.2 +=========================== + +- Fixed a critical bug that returned wrong results when doing repetitive + queries affecting the last row part of indices. Fixes #60 of the private + Trac of Carabos. + +- Added ``__enter__()`` and ``__exit__()`` methods to ``File``; fixes #113. + With this, and if using Python 2.5 you can do things like: + + with tables.openFile("test.h5") as h5file: + ... + +- Carefully preserve type when converting NumPy scalar to numarray; fixes + #125. + +- Fixed a nasty bug that appeared when moving or renaming groups due to a bad + interaction between ``Group._g_updateChildrenLocation()`` and the LRU cache. + Solves #126. + +- Return 0 when no rows are given to ``Table.modifyRows()``; fixes #128. + +- Added an informative message when the ``nctoh5`` utility is run without the + NetCDF interface of ScientificPython bening installed. + +- Now, a default representation of closed nodes is provided; fixes #129. + + +Changes from 2.0 to 2.0.1 +========================= + +- The ``coords`` argument of ``Table.readCoords()`` was not checked + for contiguousness, raising fatal errors when it was discontiguous. + This has been fixed. + +- There is an inconsistency in the way used to specify the atom shape + in ``Atom`` constructors. When the shape is specified as + ``shape=()`` it means a scalar atom and when it is specified as + ``shape=N`` it means an atom with ``shape=(N,)``. But when the + shape is specified as ``shape=1`` (i.e. in the default case) then a + scalar atom is obtained instead of an atom with ``shape=(1,)``. + This is inconsistent and not the behavior that NumPy exhibits. + + Changing this will require a migration path which includes + deprecating the old behaviour if we want to make the change happen + before a new major version. The proposed path is: + + 1. In PyTables 2.0.1, we are changing the default value of the + ``shape`` argument to ``()``, and issue a ``DeprecationWarning`` + when someone uses ``shape=1`` stating that, for the time being, + it is equivalent to ``()``, but in near future versions it will + become equivalent to ``(1,)``, and recommending the user to pass + ``shape=()`` if a scalar is desired. + + 2. In PyTables 2.1, we will remove the previous warning and take + ``shape=N`` to mean ``shape=(N,)`` for any value of N. + + See ticket #96 for more info. + +- The info about the ``chunkshape`` attribute of a leaf is now printed + in the ``__repr__()`` of chunked leaves (all except ``Array``). + +- After some scrupulous benchmarking job, the size of the I/O buffer + for ``Table`` objects has been reduced to the minimum that allows + maximum performance. 
This represents more than 10x of reduction in + size for that buffer, which will benefit those programs dealing with + many tables simultaneously (#109). + +- In the ``ptrepack`` utility, when ``--complevel`` and ``--shuffle`` + were specified at the same time, the 'shuffle' filter was always set + to 'off'. This has been fixed (#104). + +- An ugly bug related with the integrated Numexpr not being aware of + all the variations of data arrangements in recarray objects has been + fixed (#103). We should stress that the bug only affected the + Numexpr version integrated in PyTables, and *not* the original one. + +- When passing a record array to a table at creation time, its real + length is now used instead of the default value for + ``expectedrows``. This allows for better performance (#97). + +- Added some workarounds so that NumPy scalars can be successfully + converted to numarray objects. Fixes #98. + +- PyTables is now able to access table rows beyond 2**31 in 32-bit + Python. The problem was a limitation of ``xrange`` and we have + replaced it by a new ``lrange`` class written in Pyrex. Moreover, + ``lrange`` has been made publicly accessible as a safe 64-bit + replacement for ``xrange`` for 32-bit platforms users. Fixes #99. + +- If a group and a table are created in a function, and the table is + accessed through the group, the table can be flushed now. Fixes + #94. + +- It is now possible to directly assign a field in a nested record of + a table using the natural naming notation (#93). + + +Changes from 2.0rc2 to 2.0 +========================== + +- Added support for recognizing native HDF5 files with datasets compressed + with szip compressor. + +- Fixed a problem when asking for the string representation (str()) of closed + files. Fixes ticket #79. + +- Do not take LZO as available when its initialisation fails. + +- Fixed a glitch in ptrepack utility. When the user wants a copy of a group, + and a group is *to be created* in destination, the attributes of the + original group *are* copied. If it is *not to be created*, the attributes + will *not be* copied. I think this should be what the user would expect most + of the times. + +- Fixed the check for creating intermediate groups in ptrepack utility. + Solves ticket #83. + +- Before, when reading a dataset with an unknown CLASS id, a warning was + issued and the dataset mapped to ``UnImplemented``. This closed the door to + have the opportunity to try to recognize the dataset and map it to a + supported CLASS. Now, when a CLASS attribute is not recognized, an attempt + to recognize its associated dataset is made. If it is recognized, the + matching class is associated with the dataset. If it is not recognized, then + a warning is issued and the dataset becomes mapped to ``UnImplemented``. + +- Always pass verbose and heavy values in the common test module to test(). + Fixes ticket #85. + +- Now, the ``verbose`` and ``--heavy`` flag passed to test_all.py are honored. + +- All the DLL's of dependencies are included now in Windows binaries. This + should allow for better portability of the binaries. + +- Fixed the description of Node._v_objectID that was misleading. + + +Changes from 2.0rc1 to 2.0rc2 +============================= + +- The "Optimization tips" chapter of the User's Guide has been completely + updated to adapt to PyTables 2.0 series. 
In particular, new benchmarks on + the much improved indexed queries have been included; you will see that + PyTables indexing is competitive (and sometimes much faster) than that of + traditional relational databases. With this, the manual should be fairly + finished for 2.0 final release. + +- Large refactoring done on the ``Row`` class. The most important change is + that ``Table.row`` is now a single object. This allows to reuse the same + ``Row`` instance even after ``Table.flush()`` calls, which can be convenient + in many situations. + +- I/O buffers unified in the ``Row`` class. That allows for bigger savings in + memory space whenever the ``Row`` extension is used. + +- Improved speed (up to a 70%) with unaligned column operations (a quite + common scenario when dealing with ``Table`` objects) through the integrated + Numexpr. In-kernel searches take advantage of this optimization. + +- Added ``VLUnicodeAtom`` for storing variable-length Unicode strings in + ``VLArray`` objects regardless of encoding. Closes ticket #51. + +- Added support for ``time`` datatypes to be portable between big-endian and + low-endian architectures. This feature is not currently supported natively + by the HDF5 library, so the support for such conversion has been added in + PyTables itself. Fixes #72. + +- Added slice arguments to ``Table.readWhere()`` and ``Table.getWhereList()``. + Although API changes are frozen, this may still be seen as an inconsistency + with other query methods. The patch is backwards-compatible anyway. + +- Added missing overwrite argument to ``File.renameNode()`` and + ``Node._f_rename()``. Fixes ticket #66. + +- Calling ``tables.test()`` no longer exits the interpreter session. Fixes + ticket #67. + +- Fix comparing strings where one is a prefix of the other in integrated + Numexpr. Fixes ticket #76. + +- Added a check for avoiding an ugly HDF5 message when copying a file over + itself (for both ``copyFile()`` and ``File.copyFile()``). Fixes ticket #73. + +- Corrected the appendix E, were it was said that PyTables doesn't support + compounds of compounds (it does since version 1.2!). + + +Changes from 2.0b2 to 2.0rc1 +============================ + +- The ``lastrow`` argument of ``Table.flushRowsToIndex()`` is no longer + public. It was not documented, anyway. Fixes ticket #43. + +- Added a ``memlevel`` argument to ``Cols.createIndex()`` which allows the + user to control the amount of memory required for creating an index. + +- Added ``blocksizes`` and ``opts`` arguments to ``Cols.createIndex()``, which + allow the user to control the sizes of index datasets, and to specify + different optimization levels for each index dataset, respectively. These + are very low-level options meant only for experienced users. Normal users + should stick to the higher-level ``memlevel`` and ``optlevel``. + +- Query tests have been tuned to exhaustively check the new parametrization of + indexes. + +- A new algorithm has been implemented that better reduces the entropy of + indexes. + +- The API Reference section of the User's Manual (and the matching docstrings) + has been completely reviewed, expanded and corrected. This process has + unveiled some errors and inconsistencies which have also been fixed. + +- Fixed ``VLArray.__getitem__()`` to behave as expected in Python when using + slices, instead of following the semantics of PyTables' ``read()`` methods + (e.g. reading just one element when no stop is provided). Fixes ticket #50. 
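+
+  For instance, with a ``VLArray`` instance ``vla`` holding five rows
+  (the variable name is only illustrative)::
+
+      vla[1]       # a single row, as before
+      vla[1:4]     # now a list with rows 1, 2 and 3
+      vla[::2]     # rows 0, 2 and 4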
+ +- Removed implicit UTF-8 encoding from ``VLArray`` data using ``vlstring`` + atoms. Now a variable-length string is stored as is, which lets users use + any encoding of their choice, or none of them. A ``vlunicode`` atom will + probably be added to the next release so as to fix ticket #51. + +- Allow non-sequence objects to be passed to ``VLArray.append()`` when using + an ``object`` atom. This was already possible in 1.x but stopped working + when the old append syntax was dropped in 2.0. Fixes ticket #63. + +- Changed ``Cols.__len__()`` to return the number of rows of the table or + nested column (instead of the number of fields), like its counterparts in + ``Table`` and ``Column``. + +- Python scalars cached in ``AttributeSet`` instances are now kept as NumPy + objects instead of Python ones, because they do become NumPy objects when + retrieved from disk. Fixes ticket #59. + +- Avoid HDF5 error when appending an empty array to a ``Table`` (ticket #57) + or ``EArray`` (ticket #49) dataset. + +- Fix wrong implementation of the top-level ``table.description._v_dflts`` + map, which was also including the pathnames of columns inside nested + columns. Fixes ticket #45. + +- Optimized the access to unaligned arrays in Numexpr between a 30% and a 70%. + +- Fixed a die-hard bug that caused the loading of groups while closing a file. + This only showed with certain usage patterns of the LRU cache (e.g. the one + caused by ``ManyNodesTestCase`` in ``test_indexes.py`` under Pro). + +- Avoid copious warnings about unused functions and variables when compiling + Numexpr. + +- Several fixes to Numexpr expressions with all constant values. Fixed + tickets #53, #54, #55, #58. Reported bugs to mainstream developers. + +- Solved an issue when trying to open one of the included test files in append + mode on a system-wide installation by a normal user with no write privileges + on it. The file isn't being modified anyway, so the test is skipped then. + +- Added a new benchmark to compare the I/O speed of ``Array`` and ``EArray`` + objects with that of ``cPickle``. + +- The old ``Row.__call__()`` is no longer available as a public method. It + was not documented, anyway. Fixes ticket #46. + +- ``Cols._f_close()`` is no longer public. Fixes ticket #47. + +- ``Attributes._f_close()`` is no longer public. Fixes ticket #52. + +- The undocumented ``Description.classdict`` attribute has been completely + removed. Fixes ticket #44. + + +Changes from 2.0b1 to 2.0b2 +=========================== + +- A very exhaustive overhauling of the User's Manual is in process. The + chapters 1 (Introduction), 2 (Installation), 3 (Tutorials) have been + completed (and hopefully, the lines of code are easier to copy&paste now), + while chapter 4 (API Reference) has been done up to (and including) the + Table class. During this tedious (but critical in a library) overhauling + work, we have tried hard to synchronize the text in the User's Guide with + that which appears on the docstrings. + +- Removed the ``recursive`` argument in ``Group._f_walkNodes()``. Using it + with a false value was redundant with ``Group._f_iterNodes()``. Fixes + ticket #42. + +- Removed the ``coords`` argument from ``Table.read()``. It was undocumented + and redundant with ``Table.readCoordinates()``. Fixes ticket #41. + +- Fixed the signature of ``Group.__iter__()`` (by removing its parameters). + +- Added new ``Table.coldescrs`` and ``Table.description._v_itemsize`` + attributes. 
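+
+  Both can be inspected directly; a minimal sketch, assuming an already
+  opened ``table``::
+
+      print table.coldescrs                 # column name -> Col description
+      print table.description._v_itemsize   # size in bytes of one table row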
+ +- Added a couple of new attributes for leaves: + + * ``nrowsinbuf``: the number of rows that fit in the internal buffers. + * ``chunkshape``: the chunk size for chunked datasets. + +- Fixed setuptools so that making an egg out of the PyTables 2 package is + possible now. + +- Added a new ``tables.restrict_flavors()`` function allowing to restrict + available flavors to a given set. This can be useful e.g. if you want to + force PyTables to get NumPy data out of an old, ``numarray``-flavored + PyTables file even if the ``numarray`` package is installed. + +- Fixed a bug which caused filters of unavailable compression libraries to be + loaded as using the default Zlib library, after issuing a warning. Added a + new ``FiltersWarning`` and a ``Filters.copy()``. + + +Changes from 1.4.x to 2.0b1 +=========================== + +API additions +------------- + +- ``Column.createIndex()`` has received a couple of new parameters: + ``optlevel`` and ``filters``. The first one sets the desired quality level + of the index, while the second one allows the user to specify the filters + for the index. + +- ``Table.indexprops`` has been split into ``Table.indexFilters`` and + ``Table.autoIndex``. The later groups the functionality of the old ``auto`` + and ``reindex``. + +- The new ``Table.colpathnames`` is a sequence which contains the full + pathnames of all bottom-level columns in a table. This can be used to walk + all ``Column`` objects in a table when used with ``Table.colinstances``. + +- The new ``Table.colinstances`` dictionary maps column pathnames to their + associated ``Column`` or ``Cols`` object for simple or nested columns, + respectively. This is similar to ``Table.cols._f_col()``, but faster. + +- ``Row`` has received a new ``Row.fetch_all_fields()`` method in order to + return all the fields in the current row. This returns a NumPy void scalar + for each call. + +- New ``tables.test(verbose=False, heavy=False)`` high level function for + interactively running the complete test suite from the Python console. + +- Added a ``tables.print_versions()`` for easily getting the versions for all + the software on which PyTables relies on. + +Backward-incompatible changes +----------------------------- + +- You can no longer mark a column for indexing in a ``Col`` declaration. The + only way of creating an index for a column is to invoke the + ``createIndex()`` method of the proper column object *after the table has + been created*. + +- Now the ``Table.colnames`` attribute is just a list of the names of + top-level columns in a table. You can still get something similar to the + old structure by using ``Table.description._v_nestedNames``. See also the + new ``Table.colpathnames`` attribute. + +- The ``File.objects``, ``File.leaves`` and ``File.groups`` dictionaries have + been removed. If you still need this functionality, please use the + ``File.getNode()`` and ``File.walkNodes()`` instead. + +- ``Table.removeIndex()`` is no longer available; to remove an index on a + column, one must use the ``removeIndex()`` method of the associated + ``Column`` instance. + +- ``Column.dirty`` is no longer available. If you want to check + column index dirtiness, use ``Column.index.dirty``. + +- ``complib`` and ``complevel`` parameters have been removed from + ``File.createTable()``, ``File.createEArray()``, ``File.createCArray()`` and + ``File.createVLArray()``. They were already deprecated in PyTables 1.x. + +- The ``shape`` and ``atom`` parameters have been swapped in + ``File.createCArray()``. 
This has been done to be consistent with + ``Atom()`` definitions (i.e. type comes before and shape after). + +Deprecated features +------------------- + +- ``Node._v_rootgroup`` has been removed. Please use ``node._v_file.root`` + instead. + +- The ``Node._f_isOpen()`` and ``Leaf.isOpen()`` methods have been removed. + Please use the ``Node._v_isopen`` attribute instead (it is much faster). + +- The ``File.getAttrNode()``, ``File.setAttrNode()`` and + ``File.delAttrNode()`` methods have been removed. Please use + ``File.getNodeAttr()``, ``File.setNodeAttr()`` and ``File.delNodeAttr()`` + instead. + +- ``File.copyAttrs()`` has been removed. Please use ``File.copyNodeAttrs()`` + instead. + +- The ``table[colname]`` idiom is no longer supported. You can use + ``table.cols._f_col(column)`` for doing the same. + +API refinements +--------------- + +- ``File.createEArray()`` received a new ``shape`` parameter. This allows to + not have to use the shape of the atom so as to set the shape of the + underlying dataset on disk. + +- All the leaf constructors have received a new ``chunkshape`` parameter that + allows specifying the chunk sizes of datasets on disk. + +- All ``File.create*()`` factories for ``Leaf`` nodes have received a new + ``byteorder`` parameter that allows the user to specify the byteorder in + which data will be written to disk (data in memory is now always handled in + *native* order). + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.0.x.rst b/doc/source/release-notes/RELEASE_NOTES_v2.0.x.rst new file mode 100644 index 0000000..a1ef0c4 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.0.x.rst @@ -0,0 +1,461 @@ +======================================= + Release notes for PyTables 2.0 series +======================================= + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.com +:Author: Ivan Vilata i Balaguer +:Contact: ivan@selidor.net + + +Changes from 2.0.3 to 2.0.4 +=========================== + +- Selections in tables works now in threaded environments. The problem was in + the Numexpr package -- the solution has been reported to the upstream + authors too. Fixes #164. + +- PyTables had problems importing native HDF5 files with gaps in nested + compound types. This has been solved. Fixes #173. + +- In order to prevent a bug existing in HDF5 1.6 series, the + ``EArray.truncate()`` method refused to accept a 0 as parameter + (i.e. truncate an existing EArray to have zero rows did not work). As this + has been fixed in the recent HDF5 1.8 series, this limitation has been + removed (but only if the user has one of these installed). Fixes #171. + +- Small fixes for allowing the test suite to pass when using the new NumPy + 1.1. However, it remains a small issue with the way the new NumPy + represents complex numbers. I'm not fixing that in the PyTables suite, as + there are chances that this can be fixed in NumPy itself (see ticket #841). + + +Changes from 2.0.2 to 2.0.3 +=========================== + +- Replaced the algorithm for computing chunksizes by another that is + more general and useful for a larger range of expected dataset sizes. + The outcome of the new calculation is the same than before for + dataset sizes <= 100 GB. For datasets between 100 GB <= size < 10 + TB, larger values are returned. For sizes >= 10 TB a maximum value + of 1 MB is always returned. 
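+
+  The three tiers can be pictured with a toy rule like the one below
+  (made-up numbers, *not* the actual PyTables code; only the thresholds
+  mirror the text above)::
+
+      GB, TB = 1024 ** 3, 1024 ** 4
+
+      def toy_chunksize(expected_bytes):
+          if expected_bytes <= 100 * GB:
+              return 64 * 1024           # small and medium datasets
+          elif expected_bytes < 10 * TB:
+              return 256 * 1024          # larger chunks for huge datasets
+          return 1024 * 1024             # >= 10 TB: capped at 1 MB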
+ +- Fixed a problem when updating multidimensional cells using the + Row.update() method in the middle of table iterators . Fixes #149. + +- Added support for the latest 1.8.0 version of the HDF5 library. + Fixes ticket #127. + +- PyTables compiles now against latest versions of Pyrex (0.9.6.4). For the + first time, the extensions do compile without warnings! Fixes #159. + +- Numexpr module has been put in sync with the version in SciPy sandbox. + +- Added a couple of warnings in User's Guide so as to tell the user that it is + not safe to use methods that can change the number of rows of a table in the + middle of a row iterator. Fixes #153. + + +Changes from 2.0.1 to 2.0.2 +=========================== + +- Added ``__enter__()`` and ``__exit__()`` methods to ``File``; fixes #113. + With this, and if using Python 2.5 you can do things like: + + with tables.openFile("test.h5") as h5file: + ... + +- Carefully preserve type when converting NumPy scalar to numarray; fixes + #125. + +- Fixed a nasty bug that appeared when moving or renaming groups due to a bad + interaction between ``Group._g_updateChildrenLocation()`` and the LRU cache. + Solves #126. + +- Return 0 when no rows are given to ``Table.modifyRows()``; fixes #128. + +- Added an informative message when the ``nctoh5`` utility is run without the + NetCDF interface of ScientificPython being installed. + +- Now, a default representation of closed nodes is provided; fixes #129. + + +Changes from 2.0 to 2.0.1 +========================= + +- The ``coords`` argument of ``Table.readCoords()`` was not checked + for contiguousness, raising fatal errors when it was discontiguous. + This has been fixed. + +- There is an inconsistency in the way used to specify the atom shape + in ``Atom`` constructors. When the shape is specified as + ``shape=()`` it means a scalar atom and when it is specified as + ``shape=N`` it means an atom with ``shape=(N,)``. But when the + shape is specified as ``shape=1`` (i.e. in the default case) then a + scalar atom is obtained instead of an atom with ``shape=(1,)``. + This is inconsistent and not the behavior that NumPy exhibits. + + Changing this will require a migration path which includes + deprecating the old behaviour if we want to make the change happen + before a new major version. The proposed path is: + + 1. In PyTables 2.0.1, we are changing the default value of the + ``shape`` argument to ``()``, and issue a ``DeprecationWarning`` + when someone uses ``shape=1`` stating that, for the time being, + it is equivalent to ``()``, but in near future versions it will + become equivalent to ``(1,)``, and recommending the user to pass + ``shape=()`` if a scalar is desired. + + 2. In PyTables 2.1, we will remove the previous warning and take + ``shape=N`` to mean ``shape=(N,)`` for any value of N. + + See ticket #96 for more info. + +- The info about the ``chunkshape`` attribute of a leaf is now printed + in the ``__repr__()`` of chunked leaves (all except ``Array``). + +- After some scrupulous benchmarking job, the size of the I/O buffer + for ``Table`` objects has been reduced to the minimum that allows + maximum performance. This represents more than 10x of reduction in + size for that buffer, which will benefit those programs dealing with + many tables simultaneously (#109). + +- In the ``ptrepack`` utility, when ``--complevel`` and ``--shuffle`` + were specified at the same time, the 'shuffle' filter was always set + to 'off'. This has been fixed (#104). 
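+
+  A combination such as the following now keeps the shuffle filter
+  enabled (file and group names are only an example)::
+
+      ptrepack --complevel=1 --complib=zlib --shuffle=1 data.h5:/ packed.h5:/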
+ +- An ugly bug related with the integrated Numexpr not being aware of + all the variations of data arrangements in recarray objects has been + fixed (#103). We should stress that the bug only affected the + Numexpr version integrated in PyTables, and *not* the original one. + +- When passing a record array to a table at creation time, its real + length is now used instead of the default value for + ``expectedrows``. This allows for better performance (#97). + +- Added some workarounds so that NumPy scalars can be successfully + converted to numarray objects. Fixes #98. + +- PyTables is now able to access table rows beyond 2**31 in 32-bit + Python. The problem was a limitation of ``xrange`` and we have + replaced it by a new ``lrange`` class written in Pyrex. Moreover, + ``lrange`` has been made publicly accessible as a safe 64-bit + replacement for ``xrange`` for 32-bit platforms users. Fixes #99. + +- If a group and a table are created in a function, and the table is + accessed through the group, the table can be flushed now. Fixes + #94. + +- It is now possible to directly assign a field in a nested record of + a table using the natural naming notation (#93). + + +Changes from 2.0rc2 to 2.0 +========================== + +- Added support for recognizing native HDF5 files with datasets compressed + with szip compressor. + +- Fixed a problem when asking for the string representation (str()) of closed + files. Fixes ticket #79. + +- Do not take LZO as available when its initialisation fails. + +- Fixed a glitch in ptrepack utility. When the user wants a copy of a group, + and a group is *to be created* in destination, the attributes of the + original group *are* copied. If it is *not to be created*, the attributes + will *not be* copied. I think this should be what the user would expect most + of the times. + +- Fixed the check for creating intermediate groups in ptrepack utility. + Solves ticket #83. + +- Before, when reading a dataset with an unknown CLASS id, a warning was + issued and the dataset mapped to ``UnImplemented``. This closed the door to + have the opportunity to try to recognize the dataset and map it to a + supported CLASS. Now, when a CLASS attribute is not recognized, an attempt + to recognize its associated dataset is made. If it is recognized, the + matching class is associated with the dataset. If it is not recognized, then + a warning is issued and the dataset becomes mapped to ``UnImplemented``. + +- Always pass verbose and heavy values in the common test module to test(). + Fixes ticket #85. + +- Now, the ``verbose`` and ``--heavy`` flag passed to test_all.py are honored. + +- All the DLL's of dependencies are included now in Windows binaries. This + should allow for better portability of the binaries. + +- Fixed the description of Node._v_objectID that was misleading. + + +Changes from 2.0rc1 to 2.0rc2 +============================= + +- The "Optimization tips" chapter of the User's Guide has been completely + updated to adapt to PyTables 2.0 series. In particular, new benchmarks on + the much improved indexed queries have been included; you will see that + PyTables indexing is competitive (and sometimes much faster) than that of + traditional relational databases. With this, the manual should be fairly + finished for 2.0 final release. + +- Large refactoring done on the ``Row`` class. The most important change is + that ``Table.row`` is now a single object. 
This allows to reuse the same + ``Row`` instance even after ``Table.flush()`` calls, which can be convenient + in many situations. + +- I/O buffers unified in the ``Row`` class. That allows for bigger savings in + memory space whenever the ``Row`` extension is used. + +- Improved speed (up to a 70%) with unaligned column operations (a quite + common scenario when dealing with ``Table`` objects) through the integrated + Numexpr. In-kernel searches take advantage of this optimization. + +- Added ``VLUnicodeAtom`` for storing variable-length Unicode strings in + ``VLArray`` objects regardless of encoding. Closes ticket #51. + +- Added support for ``time`` datatypes to be portable between big-endian and + low-endian architectures. This feature is not currently supported natively + by the HDF5 library, so the support for such conversion has been added in + PyTables itself. Fixes #72. + +- Added slice arguments to ``Table.readWhere()`` and ``Table.getWhereList()``. + Although API changes are frozen, this may still be seen as an inconsistency + with other query methods. The patch is backwards-compatible anyway. + +- Added missing overwrite argument to ``File.renameNode()`` and + ``Node._f_rename()``. Fixes ticket #66. + +- Calling ``tables.test()`` no longer exits the interpreter session. Fixes + ticket #67. + +- Fix comparing strings where one is a prefix of the other in integrated + Numexpr. Fixes ticket #76. + +- Added a check for avoiding an ugly HDF5 message when copying a file over + itself (for both ``copyFile()`` and ``File.copyFile()``). Fixes ticket #73. + +- Corrected the appendix E, were it was said that PyTables doesn't support + compounds of compounds (it does since version 1.2!). + + +Changes from 2.0b2 to 2.0rc1 +============================ + +- The API Reference section of the User's Manual (and the matching docstrings) + has been completely reviewed, expanded and corrected. This process has + unveiled some errors and inconsistencies which have also been fixed. + +- Fixed ``VLArray.__getitem__()`` to behave as expected in Python when using + slices, instead of following the semantics of PyTables' ``read()`` methods + (e.g. reading just one element when no stop is provided). Fixes ticket #50. + +- Removed implicit UTF-8 encoding from ``VLArray`` data using ``vlstring`` + atoms. Now a variable-length string is stored as is, which lets users use + any encoding of their choice, or none of them. A ``vlunicode`` atom will + probably be added to the next release so as to fix ticket #51. + +- Allow non-sequence objects to be passed to ``VLArray.append()`` when using + an ``object`` atom. This was already possible in 1.x but stopped working + when the old append syntax was dropped in 2.0. Fixes ticket #63. + +- Changed ``Cols.__len__()`` to return the number of rows of the table or + nested column (instead of the number of fields), like its counterparts in + ``Table`` and ``Column``. + +- Python scalars cached in ``AttributeSet`` instances are now kept as NumPy + objects instead of Python ones, because they do become NumPy objects when + retrieved from disk. Fixes ticket #59. + +- Avoid HDF5 error when appending an empty array to a ``Table`` (ticket #57) + or ``EArray`` (ticket #49) dataset. + +- Fix wrong implementation of the top-level ``table.description._v_dflts`` + map, which was also including the pathnames of columns inside nested + columns. Fixes ticket #45. + +- Optimized the access to unaligned arrays in Numexpr between a 30% and a 70%. 
+ +- Fixed a die-hard bug that caused the loading of groups while closing a file. + This only showed with certain usage patterns of the LRU cache (e.g. the one + caused by ``ManyNodesTestCase`` in ``test_indexes.py`` under Pro). + +- Avoid copious warnings about unused functions and variables when compiling + Numexpr. + +- Several fixes to Numexpr expressions with all constant values. Fixed + tickets #53, #54, #55, #58. Reported bugs to mainstream developers. + +- Solved an issue when trying to open one of the included test files in append + mode on a system-wide installation by a normal user with no write privileges + on it. The file isn't being modified anyway, so the test is skipped then. + +- Added a new benchmark to compare the I/O speed of ``Array`` and ``EArray`` + objects with that of ``cPickle``. + +- The old ``Row.__call__()`` is no longer available as a public method. It + was not documented, anyway. Fixes ticket #46. + +- ``Cols._f_close()`` is no longer public. Fixes ticket #47. + +- ``Attributes._f_close()`` is no longer public. Fixes ticket #52. + +- The undocumented ``Description.classdict`` attribute has been completely + removed. Fixes ticket #44. + + +Changes from 2.0b1 to 2.0b2 +=========================== + +- A very exhaustive overhauling of the User's Manual is in process. The + chapters 1 (Introduction), 2 (Installation), 3 (Tutorials) have been + completed (and hopefully, the lines of code are easier to copy&paste now), + while chapter 4 (API Reference) has been done up to (and including) the + Table class. During this tedious (but critical in a library) overhauling + work, we have tried hard to synchronize the text in the User's Guide with + that which appears on the docstrings. + +- Removed the ``recursive`` argument in ``Group._f_walkNodes()``. Using it + with a false value was redundant with ``Group._f_iterNodes()``. Fixes + ticket #42. + +- Removed the ``coords`` argument from ``Table.read()``. It was undocumented + and redundant with ``Table.readCoordinates()``. Fixes ticket #41. + +- Fixed the signature of ``Group.__iter__()`` (by removing its parameters). + +- Added new ``Table.coldescrs`` and ``Table.description._v_itemsize`` + attributes. + +- Added a couple of new attributes for leaves: + + * ``nrowsinbuf``: the number of rows that fit in the internal buffers. + * ``chunkshape``: the chunk size for chunked datasets. + +- Fixed setuptools so that making an egg out of the PyTables 2 package is + possible now. + +- Added a new ``tables.restrict_flavors()`` function allowing to restrict + available flavors to a given set. This can be useful e.g. if you want to + force PyTables to get NumPy data out of an old, ``numarray``-flavored + PyTables file even if the ``numarray`` package is installed. + +- Fixed a bug which caused filters of unavailable compression libraries to be + loaded as using the default Zlib library, after issuing a warning. Added a + new ``FiltersWarning`` and a ``Filters.copy()``. + + +Important changes from 1.4.x to 2.0 +=================================== + +API additions +------------- + +- ``Column.createIndex()`` has received a couple of new parameters: + ``optlevel`` and ``filters``. The first one sets the desired quality level + of the index, while the second one allows the user to specify the filters + for the index. + +- ``Table.indexprops`` has been split into ``Table.indexFilters`` and + ``Table.autoIndex``. The later groups the functionality of the old ``auto`` + and ``reindex``. 
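+
+  A minimal migration sketch (the ``Filters`` settings below are just an
+  example)::
+
+      table.autoIndex = True    # covers what ``auto``/``reindex`` used to do
+      table.indexFilters = tables.Filters(complevel=1, complib='zlib')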
+ +- The new ``Table.colpathnames`` is a sequence which contains the full + pathnames of all bottom-level columns in a table. This can be used to walk + all ``Column`` objects in a table when used with ``Table.colinstances``. + +- The new ``Table.colinstances`` dictionary maps column pathnames to their + associated ``Column`` or ``Cols`` object for simple or nested columns, + respectively. This is similar to ``Table.cols._f_col()``, but faster. + +- ``Row`` has received a new ``Row.fetch_all_fields()`` method in order to + return all the fields in the current row. This returns a NumPy void scalar + for each call. + +- New ``tables.test(verbose=False, heavy=False)`` high level function for + interactively running the complete test suite from the Python console. + +- Added a ``tables.print_versions()`` for easily getting the versions for all + the software on which PyTables relies on. + + +Backward-incompatible changes +----------------------------- + +- You can no longer mark a column for indexing in a ``Col`` declaration. The + only way of creating an index for a column is to invoke the + ``createIndex()`` method of the proper column object *after the table has + been created*. + +- Now the ``Table.colnames`` attribute is just a list of the names of + top-level columns in a table. You can still get something similar to the + old structure by using ``Table.description._v_nestedNames``. See also the + new ``Table.colpathnames`` attribute. + +- The ``File.objects``, ``File.leaves`` and ``File.groups`` dictionaries have + been removed. If you still need this functionality, please use the + ``File.getNode()`` and ``File.walkNodes()`` instead. + +- ``Table.removeIndex()`` is no longer available; to remove an index on a + column, one must use the ``removeIndex()`` method of the associated + ``Column`` instance. + +- ``Column.dirty`` is no longer available. If you want to check + column index dirtiness, use ``Column.index.dirty``. + +- ``complib`` and ``complevel`` parameters have been removed from + ``File.createTable()``, ``File.createEArray()``, ``File.createCArray()`` and + ``File.createVLArray()``. They were already deprecated in PyTables 1.x. + +- The ``shape`` and ``atom`` parameters have been swapped in + ``File.createCArray()``. This has been done to be consistent with + ``Atom()`` definitions (i.e. type comes before and shape after). + +Deprecated features +------------------- + +- ``Node._v_rootgroup`` has been removed. Please use ``node._v_file.root`` + instead. + +- The ``Node._f_isOpen()`` and ``Leaf.isOpen()`` methods have been removed. + Please use the ``Node._v_isopen`` attribute instead (it is much faster). + +- The ``File.getAttrNode()``, ``File.setAttrNode()`` and + ``File.delAttrNode()`` methods have been removed. Please use + ``File.getNodeAttr()``, ``File.setNodeAttr()`` and ``File.delNodeAttr()`` + instead. + +- ``File.copyAttrs()`` has been removed. Please use ``File.copyNodeAttrs()`` + instead. + +- The ``table[colname]`` idiom is no longer supported. You can use + ``table.cols._f_col(column)`` for doing the same. + +API refinements +--------------- + +- ``File.createEArray()`` received a new ``shape`` parameter. This allows to + not have to use the shape of the atom so as to set the shape of the + underlying dataset on disk. + +- All the leaf constructors have received a new ``chunkshape`` parameter that + allows specifying the chunk sizes of datasets on disk. 
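+
+  For instance, with an already opened file ``f`` (atom, shape and
+  chunkshape values are only an example)::
+
+      f.createCArray(f.root, 'carray', tables.Float64Atom(),
+                     shape=(1000, 1000), chunkshape=(100, 100))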
+ +- All ``File.create*()`` factories for ``Leaf`` nodes have received a new + ``byteorder`` parameter that allows the user to specify the byteorder in + which data will be written to disk (data in memory is now always handled in + *native* order). + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 78 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.1.x-pro.rst b/doc/source/release-notes/RELEASE_NOTES_v2.1.x-pro.rst new file mode 100644 index 0000000..a8a1c66 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.1.x-pro.rst @@ -0,0 +1,114 @@ +======================================= + Release notes for PyTables 2.1 series +======================================= + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.org + + +Changes from 2.1.1 to 2.1.2 +=========================== + +Bug fixes +--------- + +- Solved problems with Table.modifyColumn() when the column(s) is + multidimensional. Fixes #228. + +- The row attribute of a table seems stalled after a table move or + rename. Fixes #224. + +- Fixed a problem with ``len(array)`` in 32-bit platforms when array + is large enough (> 2**31). + +- Added missing `_c_classId` attribute to the `UnImplemented` class. + ``ptrepack`` no longer chokes while copying `Unimplemented` classes. + +- The ``FIELD_*`` sys attrs are no longer copied when the + ``PYTABLES_SYS_ATTRS`` parameter is set to false. + +- The ``FILTERS`` attribute is not added anymore when + ``PYTABLES_SYS_ATTR`` parameter is set to false. + +- Disable the printing of Unicode characters that cannot be printed on + win32 platform. Fixes #235. + +Other changes +------------- + +- When retrieving a row of a 1-dimensional array, a 0-dim array was + returned instead of a numpy scalar. Now, an actual numpy scalar is + returned. Closes #222. + +- LZO and bzip2 filters adapted to an API fix introduced in HDF5 + 1.8.3. Closes #225. + +- Unsupported HDF5 types in attributes are no longer transferred + during copies. A new `_v_unimplemented` list have been added in + `AttributeSet` class so as to keep track of such attributes. Closes + #240. + +- LZO binaries have disappeared from the GnuWin32 repository. Until + they come eventually back, they have been put at + http://www.pytables.org/download/lzo-win. This has been documented + in the install chapter. + + +Changes from 2.1 to 2.1.1 +========================= + +Bug fixes +--------- + +- Fixed a memory leak when a lot of queries were made. Closes #203 + and #207. + +- The chunkshape="auto" parameter value of `Leaf.copy()` is honored + now, even when the (start, stop, step) parameters are specified. + Closes #204. + +- Due to a flaw in its design, the `File` class was not able to be + subclassed. This has been fixed. Closes #205. + +- Default values were not correctly retrieved when opening already + created CArray/EArray objects. Fixed. Closes #212. + +- Fixed a problem with the installation of the ``nctoh5`` script that + prevented it from being executed. Closes #215. + +- [Pro] The ``iterseq`` cache ignored non-indexed conditions, giving + wrong results when those appeared in condition expressions. This + has been fixed. Closes #206. + +Other changes +------------- + +- `openFile()`, `isHDF5File()` and `isPyTablesFile()` functions accept + Unicode filenames now. Closes #202 and #214. + +- When creating large type sizes (exceeding 64 KB), HDF5 complained + and refused to do so. 
The HDF5 team has logged the issue as a bug, + but meanwhile it has been implemented a workaround in PyTables that + allows to create such large datatypes for situations that does not + require defaults other than zero. Addresses #211. + +- In order to be consistent with how are stored the other data types, + Unicode attributes are retrieved now as NumPy scalars instead of + Python Unicode strings or NumPy arrays. For the moment, I've fixed + this through pickling the Unicode strings. In the future, when HDF5 + 1.8.x series would be a requirement, that should be done via a HDF5 + native Unicode type. Closes #213. + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.1.x.rst b/doc/source/release-notes/RELEASE_NOTES_v2.1.x.rst new file mode 100644 index 0000000..a8a1c66 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.1.x.rst @@ -0,0 +1,114 @@ +======================================= + Release notes for PyTables 2.1 series +======================================= + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.org + + +Changes from 2.1.1 to 2.1.2 +=========================== + +Bug fixes +--------- + +- Solved problems with Table.modifyColumn() when the column(s) is + multidimensional. Fixes #228. + +- The row attribute of a table seems stalled after a table move or + rename. Fixes #224. + +- Fixed a problem with ``len(array)`` in 32-bit platforms when array + is large enough (> 2**31). + +- Added missing `_c_classId` attribute to the `UnImplemented` class. + ``ptrepack`` no longer chokes while copying `Unimplemented` classes. + +- The ``FIELD_*`` sys attrs are no longer copied when the + ``PYTABLES_SYS_ATTRS`` parameter is set to false. + +- The ``FILTERS`` attribute is not added anymore when + ``PYTABLES_SYS_ATTR`` parameter is set to false. + +- Disable the printing of Unicode characters that cannot be printed on + win32 platform. Fixes #235. + +Other changes +------------- + +- When retrieving a row of a 1-dimensional array, a 0-dim array was + returned instead of a numpy scalar. Now, an actual numpy scalar is + returned. Closes #222. + +- LZO and bzip2 filters adapted to an API fix introduced in HDF5 + 1.8.3. Closes #225. + +- Unsupported HDF5 types in attributes are no longer transferred + during copies. A new `_v_unimplemented` list have been added in + `AttributeSet` class so as to keep track of such attributes. Closes + #240. + +- LZO binaries have disappeared from the GnuWin32 repository. Until + they come eventually back, they have been put at + http://www.pytables.org/download/lzo-win. This has been documented + in the install chapter. + + +Changes from 2.1 to 2.1.1 +========================= + +Bug fixes +--------- + +- Fixed a memory leak when a lot of queries were made. Closes #203 + and #207. + +- The chunkshape="auto" parameter value of `Leaf.copy()` is honored + now, even when the (start, stop, step) parameters are specified. + Closes #204. + +- Due to a flaw in its design, the `File` class was not able to be + subclassed. This has been fixed. Closes #205. + +- Default values were not correctly retrieved when opening already + created CArray/EArray objects. Fixed. Closes #212. + +- Fixed a problem with the installation of the ``nctoh5`` script that + prevented it from being executed. Closes #215. 
+ +- [Pro] The ``iterseq`` cache ignored non-indexed conditions, giving + wrong results when those appeared in condition expressions. This + has been fixed. Closes #206. + +Other changes +------------- + +- `openFile()`, `isHDF5File()` and `isPyTablesFile()` functions accept + Unicode filenames now. Closes #202 and #214. + +- When creating large type sizes (exceeding 64 KB), HDF5 complained + and refused to do so. The HDF5 team has logged the issue as a bug, + but meanwhile it has been implemented a workaround in PyTables that + allows to create such large datatypes for situations that does not + require defaults other than zero. Addresses #211. + +- In order to be consistent with how are stored the other data types, + Unicode attributes are retrieved now as NumPy scalars instead of + Python Unicode strings or NumPy arrays. For the moment, I've fixed + this through pickling the Unicode strings. In the future, when HDF5 + 1.8.x series would be a requirement, that should be done via a HDF5 + native Unicode type. Closes #213. + + +---- + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.2.x-pro.rst b/doc/source/release-notes/RELEASE_NOTES_v2.2.x-pro.rst new file mode 100644 index 0000000..4067b7d --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.2.x-pro.rst @@ -0,0 +1,426 @@ +======================================= + Release notes for PyTables 2.2 series +======================================= + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.org + + +Changes from 2.2.1rc1 to 2.2.1 +============================== + +- The `Row` accessor implements a new `__contains__` special method that + allows doing things like:: + + for row in table: + if item in row: + print "Value found in row", row.nrow + break + + Closes #309. + +- PyTables is more friendly with easy_install and pip now, as all the + Python dependencies should be installed automatically. Closes #298. + + +Changes from 2.2 to 2.2.1rc1 +============================ + +- When using `ObjectAtom` objects in `VLArrays` the ``HIGHEST_PROTOCOL`` + is used for pickling objects. For NumPy arrays, this simple change + leads to space savings up to 3x and time improvements up to 30x. + Closes #301. + +- The `Row` accessor implements a new `__contains__` special method that + allows doing things like:: + + for row in table: + if item in row: + print "Value found in row", row.nrow + break + + Closes #309. + +- tables.Expr can perform operations on scalars now. Thanks to Gaëtan + de Menten for providing a patch for this. Closes #287. + +- Fixed a problem with indexes larger than 32-bit on leaf objects on + 32-bit machines. Fixes #283. + +- Merged in Blosc 1.1.2 for fixing a problem with large datatypes and + subprocess issues. Closes #288 and #295. + +- Due to the adoption of Blosc 1.1.2, the pthreads-win32 library + dependency is dropped on Windows platforms. + +- Fixed a problem with tables.Expr and operands with vary large + rowsizes. Closes #300. + +- ``leaf[numpy.array[scalar]]`` idiom returns a NumPy array instead of + an scalar. This has been done for compatibility with NumPy. Closes + #303. + +- Optimization for `Table.copy()` so that ``FIELD_*`` attrs are not + overwritten during the copy. This can lead to speed-ups up to 100x + for short tables that have hundreds of columns. Closes #304. 
+ +- For external links, its relative paths are resolved now with respect + to the directory of the main HDF5 file, rather than with respect to + the current directory. Closes #306. + +- ``Expr.setInputsRange()`` and ``Expr.setOutputRange()`` do support + ``numpy.integer`` types now. Closes #285. + +- Column names in tables can start with '__' now. Closes #291. + +- Unicode empty strings are supported now as attributes. Addresses #307. + +- Cython 0.13 and higher is supported now. Fixes #293. + +- PyTables should be more 'easy_install'-able now. Addresses #298. + + +Changes from 2.2rc2 to 2.2 (final) +================================== + +- Updated Blosc to 1.0 (final). + +- Filter ID of Blosc changed from wrong 32010 to reserved 32001. This + will prevent PyTables 2.2 (final) to read files created with Blosc and + PyTables 2.2 pre-final. `ptrepack` can be used to retrieve those + files, if necessary. More info in ticket #281. + +- Recent benchmarks suggest a new parametrization is better in most + scenarios: + + * The default chunksize has been doubled for every dataset size. This + works better in most of scenarios, specially with the new Blosc + compressor. + + * The HDF5 CHUNK_CACHE_SIZE parameter has been raised to 2 MB in order + to better adapt to the chunksize increase. This provides better hit + ratio (at the cost of consuming more memory). + + Some plots have been added to the User's Manual (chapter 5) showing + how the new parametrization works. + + +Changes from 2.2rc1 to 2.2rc2 +============================= + +- A new version of Blosc (0.9.5) is included. This version is now + considered to be stable and apt for production. Thanks for all + PyTables users that have contributed to find and report bugs. + +- Added a new `IO_BUFFER_SIZE` parameter to ``tables/parameters.py`` + that allows to set the internal PyTables' buffer for doing I/O. This + replaces `CHUNKTIMES` but it is more general because it affects to all + `Leaf` objects and also the `tables.Expr` module (and not only tables + as before). + +- `BUFFERTIMES` parameter in ``tables/parameters.py`` has been + renamed to `BUFFER_TIMES` which is more consistent with other + parameter names. + +- On Windows platforms, the path to the tables module is now appended to + sys.path and the PATH environment variable. That way DLLs and PYDs in + the tables directory are to be found now. Thanks to Christoph Gohlke + for the hint. + +- A replacement for barriers for Mac OSX, or other systems not + implementing them, has been carried out. This allows to compile + PyTables on such platforms. Fixes #278 + +- Fixed a couple of warts that raise compatibility warnings with + forthcoming Python 2.7. + +- HDF5 1.8.5 is used in Windows binaries. + +Changes from 2.2b3 to 2.2rc1 +============================ + +- Numexpr is not included anymore in PyTables and has become a requisite + instead. This is because Numexpr already has decent enough installers + and is available in the PyPI repository also, so it should be easy for + users to fulfill this dependency. + +- When using a Numexpr package that is turbo-loaded with Intel's + VML/MKL, the parameter `MAX_THREADS` will control the number of + threads that VML can use during computations. For a finer control, + the `numexpr.set_vml_num_threads()` can always be used. + +- Cython is used now instead of Pyrex for Pyrex extensions. + +- Updated to 0.9 version of Blosc compressor. This version can make use + of threads so as to accelerate the compression/decompression process. 
+ In order to change the maximum number of threads that Blosc can use (2 + by default), you can modify the `MAX_THREADS` variable in + ``tables/parameters.py`` or make use of the new `setBloscMaxThreads()` + global function. + +- Reopening already opened files is supported now, provided that there is + not incompatibility among intended usages (for example, you cannot + reopen in append mode an already opened file in read-only mode). + +- Option ``--print-versions`` for ``test_all.py`` script is now + preferred over the deprecated ``--show-versions``. This is more + consistent with the existing `print_versions()` function. + +- Fixed a bug that, under some circumstances, prevented the use of table + iterators in `itertool.groupby()`. Now, you can safely do things + like:: + + sel_rows = table.where('(row_id >= 3)') + for group_id, grouped_rows in itertools.groupby(sel_rows, f_group): + group_mean = average([row['row_id'] for row in grouped_rows]) + + Fixes #264. + +- Copies of `Array` objects with multidimensional atoms (coming from + native HDF5 files) work correctly now (i.e. the copy holds the atom + dimensionality). Fixes #275. + +- The `tables.openFile()` function does not try anymore to open/close + the file in order to guess whether it is a HDF5 or PyTables one before + opening it definitely. This allows the `fcntl.flock()` and + `fcntl.lockf()` Python functions to work correctly now (that's useful + for arbitrating access to the file by different processes). Thanks to + Dag Sverre Seljebotn and Ivan Vilata for their suggestions on hunting + this one! Fixes #185. + +- The estimation of the chunksize when using multidimensional atoms in + EArray/Carray was wrong because it did not take in account the shape + of the atom. Thanks to Ralf Juengling for reporting. Fixes #273. + +- Non-contiguous arrays can now safely be saved as attributes. Before, + if arrays were not contiguous, incorrect data was saved in attr. + Fixes #270. + +- EXTDIM attribute for CArray/EArray now saves the correct extendable + dimension, instead of rubbish. This does not affected functionality, + because extendable dimension was retrieved directly from shape + information, but it was providing misleading information to the user. + Fixes #268. + +API changes +----------- + +- Now, `Table.Cols.__len__()` returns the number of top level columns + instead of the number of rows in table. This is more consistent in + that `Table.Cols` is an accessor for *columns*. Fixes #276. + + +Changes from 2.2b2 to 2.2b3 +=========================== + +- Blosc compressor has been added as an additional filter, in addition + to the existing Zlib, LZO and bzip2. This new compressor is meant for + fast compression and extremely fast decompression. Fixes #265. + +- In `File.copyFile()` method, `copyuserattrs` was set to false as + default. This was inconsistent with other methods where the default + value for `copyuserattrs` is true. The default for this is true now. + Closes #261. + +- `tables.copyFile` and `File.copyFile` recognize now the parameters + present in ``tables/parameters.py``. Fixes #262. + +- Backported fix for issue #25 in Numexpr (OP_NEG_LL treats the argument + as an int, not a long long). Thanks to David Cooke for this. + +- CHUNK_CACHE_NELMTS in `tables/parameters.py` set to a prime number as + Neil Fortner suggested. + +- Workaround for a problem in Python 2.6.4 (and probably other versions + too) for pickling strings like "0" or "0.". Fixes #253. 
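+
+A minimal usage sketch for the new Blosc filter (this assumes the
+2.2-era camel-case API, i.e. ``openFile()``/``createTable()``, and the
+usual ``Filters`` keyword arguments; the file and column names are
+only illustrative)::
+
+    import tables
+
+    # Select Blosc as the compression library, like any other filter.
+    filters = tables.Filters(complevel=5, complib='blosc')
+
+    fileh = tables.openFile('blosc-demo.h5', mode='w')
+    table = fileh.createTable('/', 'data', {'x': tables.Float64Col()},
+                              filters=filters)
+    table.append([(float(i),) for i in range(1000)])
+    fileh.close()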
+ + +Changes from 2.2b1 to 2.2b2 +=========================== + +Enhancements +------------ + +- Support for HDF5 hard links, soft links and external links (when + PyTables is compiled against HDF5 1.8.x series). A new tutorial about + its usage has been added to the 'Tutorials' chapter of User's Manual. + Closes #239 and #247. + +- Added support for setting HDF5 chunk cache parameters in file + opening/creating time. 'CHUNK_CACHE_NELMTS', 'CHUNK_CACHE_PREEMPT' + and 'CHUNK_CACHE_SIZE' are the new parameters. See "PyTables' + parameter files" appendix in User's Manual for more info. Closes + #221. + +- New `Unknown` class added so that objects that HDF5 identifies as + ``H5G_UNKNOWN`` can be mapped to it and continue operations + gracefully. + +- Optimization in the indexed queries when the resulting rows increase + monotonically. From 3x (for medium-size query results) and 10x (for very + large query results) speed-ups can be expected. + +- Added flag `--dont-create-sysattrs` to ``ptrepack`` so as to not + create sys attrs (default is to do it). + +- Support for native compound types in attributes. This allows for + better compatibility with HDF5 files. Closes #208. + +- Support for native NumPy dtype in the description parameter of + `File.createTable()`. Closes #238. + + +Bugs fixed +---------- + +- Added missing `_c_classId` attribute to the `UnImplemented` class. + ``ptrepack`` no longer chokes while copying `Unimplemented` classes. + +- The ``FIELD_*`` sys attrs are no longer copied when the + ``PYTABLES_SYS_ATTRS`` parameter is set to false. + +- `File.createTable()` no longer segfaults if description=None. Closes + #248. + +- Workaround for avoiding a Python issue causing a segfault when saving + and then retrieving a string attribute with values "0" or "0.". + Closes #253. + + +API changes +----------- + +- `Row.__contains__()` disabled because it has little sense to query for + a key in Row, and the correct way should be to query for it in + `Table.colnames` or `Table.colpathnames` better. Closes #241. + +- [Semantic change] To avoid a common pitfall when asking for the string + representation of a `Row` class, `Row.__str__()` has been redefined. + Now, it prints something like:: + + >>> for row in table: + ... print row + ... + /newgroup/table.row (Row), pointing to row #0 + /newgroup/table.row (Row), pointing to row #1 + /newgroup/table.row (Row), pointing to row #2 + + instead of:: + + >>> for row in table: + ... print row + ... + ('Particle: 0', 0, 10, 0.0, 0.0) + ('Particle: 1', 1, 9, 1.0, 1.0) + ('Particle: 2', 2, 8, 4.0, 4.0) + + Use `print row[:]` idiom if you want to reproduce the old behaviour. + Closes #252. + + +Other changes +------------- + +- After some improvements in both HDF5 and PyTables, the limit before + emitting a `PerformanceWarning` on the number of children in a group + has been raised from 4096 to 16384. + + +Changes from 2.1.1 to 2.2b1 +=========================== + +Enhancements +------------ + +- Added `Expr`, a class for evaluating expressions containing + array-like objects. It can evaluate expressions (like '3*a+4*b') + that operate on arbitrary large arrays while optimizing the + resources (basically main memory and CPU cache memory) required to + perform them. It is similar to the Numexpr package, but in addition + to NumPy objects, it also accepts disk-based homogeneous arrays, + like the `Array`, `CArray`, `EArray` and `Column` PyTables objects. + +- Added support for NumPy's extended slicing in all `Leaf` objects. 
+ With that, you can do the next sort of selections:: + + array1 = array[4] # simple selection + array2 = array[4:1000:2] # slice selection + array3 = array[1, ..., ::2, 1:4, 4:] # general slice selection + array4 = array[1, [1,5,10], ..., -1] # fancy selection + array5 = array[np.where(array[:] > 4)] # point selection + array6 = array[array[:] > 4] # boolean selection + + Thanks to Andrew Collette for implementing this for h5py, from which + it has been backported. Closes #198 and #209. + +- Numexpr updated to 1.3.1. This can lead to up a 25% improvement of + the time for both in-kernel and indexed queries for unaligned + tables. + +- HDF5 1.8.3 supported. + + +Bugs fixed +---------- + +- Fixed problems when modifying multidimensional columns in Table + objects. Closes #228. + +- Row attribute is no longer stalled after a table move or rename. + Fixes #224. + +- Array.__getitem__(scalar) returns a NumPy scalar now, instead of a + 0-dim NumPy array. This should not be noticed by normal users, + unless they check for the type of returned value. Fixes #222. + + +API changes +----------- + +- Added a `dtype` attribute for all leaves. This is the NumPy + ``dtype`` that most closely matches the leaf type. This allows for + a quick-and-dirty check of leaf types. Closes #230. + +- Added a `shape` attribute for `Column` objects. This is formed by + concatenating the length of the column and the shape of its type. + Also, the representation of columns has changed an now includes the + length of the column as the leading dimension. Closes #231. + +- Added a new `maindim` attribute for `Column` which has the 0 value + (the leading dimension). This allows for a better similarity with + other \*Array objects. + +- In order to be consistent and allow the extended slicing to happen + in `VLArray` objects too, `VLArray.__setitem__()` is not able to + partially modify rows based on the second dimension passed as key. + If this is tried, an `IndexError` is raised now. Closes #210. + +- The `forceCSI` flag has been replaced by `checkCSI` in the next + `Table` methods: `copy()`, `readSorted()` and `itersorted()`. The + change reflects the fact that a re-index operation cannot be + triggered from these methods anymore. The rational for the change + is that an indexing operation is a potentially very expensive + operation that should be carried out explicitly instead of being + triggered by methods that should not be in charge of this task. + Closes #216. + + +Backward incompatible changes +----------------------------- + +- After the introduction of the `shape` attribute for `Column` + objects, the shape information for multidimensional columns has been + removed from the `dtype` attribute (it is set to the base type of + the column now). Closes #232. + + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. 
End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.2.x.rst b/doc/source/release-notes/RELEASE_NOTES_v2.2.x.rst new file mode 100644 index 0000000..e2a40c8 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.2.x.rst @@ -0,0 +1,412 @@ +======================================= + Release notes for PyTables 2.2 series +======================================= + +:Author: Francesc Alted i Abad +:Contact: faltet@pytables.org + + +Changes from 2.2.1rc1 to 2.2.1 +============================== + +- The `Row` accessor implements a new `__contains__` special method that + allows doing things like:: + + for row in table: + if item in row: + print "Value found in row", row.nrow + break + + Closes #309. + +- PyTables is more friendly with easy_install and pip now, as all the + Python dependencies should be installed automatically. Closes #298. + + +Changes from 2.2 to 2.2.1rc1 +============================ + +- When using `ObjectAtom` objects in `VLArrays` the ``HIGHEST_PROTOCOL`` + is used for pickling objects. For NumPy arrays, this simple change + leads to space savings up to 3x and time improvements up to 30x. + Closes #301. + +- tables.Expr can perform operations on scalars now. Thanks to Gaëtan + de Menten for providing a patch for this. Closes #287. + +- Fixed a problem with indexes larger than 32-bit on leaf objects on + 32-bit machines. Fixes #283. + +- Merged in Blosc 1.1.2 for fixing a problem with large datatypes and + subprocess issues. Closes #288 and #295. + +- Due to the adoption of Blosc 1.1.2, the pthreads-win32 library + dependency is dropped on Windows platforms. + +- Fixed a problem with tables.Expr and operands with vary large + rowsizes. Closes #300. + +- ``leaf[numpy.array[scalar]]`` idiom returns a NumPy array instead of + an scalar. This has been done for compatibility with NumPy. Closes + #303. + +- Optimization for `Table.copy()` so that ``FIELD_*`` attrs are not + overwritten during the copy. This can lead to speed-ups up to 100x + for short tables that have hundreds of columns. Closes #304. + +- For external links, its relative paths are resolved now with respect + to the directory of the main HDF5 file, rather than with respect to + the current directory. Closes #306. + +- ``Expr.setInputsRange()`` and ``Expr.setOutputRange()`` do support + ``numpy.integer`` types now. Closes #285. + +- Column names in tables can start with '__' now. Closes #291. + +- Unicode empty strings are supported now as attributes. Addresses #307. + +- Cython 0.13 and higher is supported now. Fixes #293. + +- PyTables should be more 'easy_install'-able now. Addresses #298. + + +Changes from 2.2rc2 to 2.2 (final) +================================== + +- Updated Blosc to 1.0 (final). + +- Filter ID of Blosc changed from wrong 32010 to reserved 32001. This + will prevent PyTables 2.2 (final) to read files created with Blosc and + PyTables 2.2 pre-final. `ptrepack` can be used to retrieve those + files, if necessary. More info in ticket #281. + +- Recent benchmarks suggest a new parametrization is better in most + scenarios: + + * The default chunksize has been doubled for every dataset size. This + works better in most of scenarios, specially with the new Blosc + compressor. + + * The HDF5 CHUNK_CACHE_SIZE parameter has been raised to 2 MB in order + to better adapt to the chunksize increase. This provides better hit + ratio (at the cost of consuming more memory). + + Some plots have been added to the User's Manual (chapter 5) showing + how the new parametrization works. 
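+
+A minimal sketch of tuning the new chunk cache further (assuming
+that, as with the other entries in ``tables/parameters.py``,
+``CHUNK_CACHE_SIZE`` can be overridden per file when calling
+``openFile()``, as described for the chunk cache parameters below;
+the file name, dataset name and the 8 MB figure are only
+illustrative)::
+
+    import tables
+
+    # Raise the HDF5 chunk cache from the new 2 MB default to 8 MB
+    # for this file only, leaving the global default untouched.
+    fileh = tables.openFile('big-chunks.h5', mode='r',
+                            CHUNK_CACHE_SIZE=8 * 1024 * 1024)
+    try:
+        data = fileh.root.data[:]   # hypothetical dataset name
+    finally:
+        fileh.close()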
+ + +Changes from 2.2rc1 to 2.2rc2 +============================= + +- A new version of Blosc (0.9.5) is included. This version is now + considered to be stable and apt for production. Thanks for all + PyTables users that have contributed to find and report bugs. + +- Added a new `IO_BUFFER_SIZE` parameter to ``tables/parameters.py`` + that allows to set the internal PyTables' buffer for doing I/O. This + replaces `CHUNKTIMES` but it is more general because it affects to all + `Leaf` objects and also the `tables.Expr` module (and not only tables + as before). + +- `BUFFERTIMES` parameter in ``tables/parameters.py`` has been + renamed to `BUFFER_TIMES` which is more consistent with other + parameter names. + +- On Windows platforms, the path to the tables module is now appended to + sys.path and the PATH environment variable. That way DLLs and PYDs in + the tables directory are to be found now. Thanks to Christoph Gohlke + for the hint. + +- A replacement for barriers for Mac OSX, or other systems not + implementing them, has been carried out. This allows to compile + PyTables on such platforms. Fixes #278 + +- Fixed a couple of warts that raise compatibility warnings with + forthcoming Python 2.7. + +- HDF5 1.8.5 is used in Windows binaries. + +Changes from 2.2b3 to 2.2rc1 +============================ + +- Numexpr is not included anymore in PyTables and has become a requisite + instead. This is because Numexpr already has decent enough installers + and is available in the PyPI repository also, so it should be easy for + users to fulfill this dependency. + +- When using a Numexpr package that is turbo-loaded with Intel's + VML/MKL, the parameter `MAX_THREADS` will control the number of + threads that VML can use during computations. For a finer control, + the `numexpr.set_vml_num_threads()` can always be used. + +- Cython is used now instead of Pyrex for Pyrex extensions. + +- Updated to 0.9 version of Blosc compressor. This version can make use + of threads so as to accelerate the compression/decompression process. + In order to change the maximum number of threads that Blosc can use (2 + by default), you can modify the `MAX_THREADS` variable in + ``tables/parameters.py`` or make use of the new `setBloscMaxThreads()` + global function. + +- Reopening already opened files is supported now, provided that there is + not incompatibility among intended usages (for example, you cannot + reopen in append mode an already opened file in read-only mode). + +- Option ``--print-versions`` for ``test_all.py`` script is now + preferred over the deprecated ``--show-versions``. This is more + consistent with the existing `print_versions()` function. + +- Fixed a bug that, under some circumstances, prevented the use of table + iterators in `itertool.groupby()`. Now, you can safely do things + like:: + + sel_rows = table.where('(row_id >= 3)') + for group_id, grouped_rows in itertools.groupby(sel_rows, f_group): + group_mean = average([row['row_id'] for row in grouped_rows]) + + Fixes #264. + +- Copies of `Array` objects with multidimensional atoms (coming from + native HDF5 files) work correctly now (i.e. the copy holds the atom + dimensionality). Fixes #275. + +- The `tables.openFile()` function does not try anymore to open/close + the file in order to guess whether it is a HDF5 or PyTables one before + opening it definitely. This allows the `fcntl.flock()` and + `fcntl.lockf()` Python functions to work correctly now (that's useful + for arbitrating access to the file by different processes). 
Thanks to + Dag Sverre Seljebotn and Ivan Vilata for their suggestions on hunting + this one! Fixes #185. + +- The estimation of the chunksize when using multidimensional atoms in + EArray/Carray was wrong because it did not take in account the shape + of the atom. Thanks to Ralf Juengling for reporting. Fixes #273. + +- Non-contiguous arrays can now safely be saved as attributes. Before, + if arrays were not contiguous, incorrect data was saved in attr. + Fixes #270. + +- EXTDIM attribute for CArray/EArray now saves the correct extendable + dimension, instead of rubbish. This does not affected functionality, + because extendable dimension was retrieved directly from shape + information, but it was providing misleading information to the user. + Fixes #268. + +API changes +----------- + +- Now, `Table.Cols.__len__()` returns the number of top level columns + instead of the number of rows in table. This is more consistent in + that `Table.Cols` is an accessor for *columns*. Fixes #276. + + +Changes from 2.2b2 to 2.2b3 +=========================== + +- Blosc compressor has been added as an additional filter, in addition + to the existing Zlib, LZO and bzip2. This new compressor is meant for + fast compression and extremely fast decompression. Fixes #265. + +- In `File.copyFile()` method, `copyuserattrs` was set to false as + default. This was inconsistent with other methods where the default + value for `copyuserattrs` is true. The default for this is true now. + Closes #261. + +- `tables.copyFile` and `File.copyFile` recognize now the parameters + present in ``tables/parameters.py``. Fixes #262. + +- Backported fix for issue #25 in Numexpr (OP_NEG_LL treats the argument + as an int, not a long long). Thanks to David Cooke for this. + +- CHUNK_CACHE_NELMTS in `tables/parameters.py` set to a prime number as + Neil Fortner suggested. + +- Workaround for a problem in Python 2.6.4 (and probably other versions + too) for pickling strings like "0" or "0.". Fixes #253. + + +Changes from 2.2b1 to 2.2b2 +=========================== + +Enhancements +------------ + +- Support for HDF5 hard links, soft links and external links (when + PyTables is compiled against HDF5 1.8.x series). A new tutorial about + its usage has been added to the 'Tutorials' chapter of User's Manual. + Closes #239 and #247. + +- Added support for setting HDF5 chunk cache parameters in file + opening/creating time. 'CHUNK_CACHE_NELMTS', 'CHUNK_CACHE_PREEMPT' + and 'CHUNK_CACHE_SIZE' are the new parameters. See "PyTables' + parameter files" appendix in User's Manual for more info. Closes + #221. + +- New `Unknown` class added so that objects that HDF5 identifies as + ``H5G_UNKNOWN`` can be mapped to it and continue operations + gracefully. + +- Added flag `--dont-create-sysattrs` to ``ptrepack`` so as to not + create sys attrs (default is to do it). + +- Support for native compound types in attributes. This allows for + better compatibility with HDF5 files. Closes #208. + +- Support for native NumPy dtype in the description parameter of + `File.createTable()`. Closes #238. + + +Bugs fixed +---------- + +- Added missing `_c_classId` attribute to the `UnImplemented` class. + ``ptrepack`` no longer chokes while copying `Unimplemented` classes. + +- The ``FIELD_*`` sys attrs are no longer copied when the + ``PYTABLES_SYS_ATTRS`` parameter is set to false. + +- `File.createTable()` no longer segfaults if description=None. Closes + #248. 
+ +- Workaround for avoiding a Python issue causing a segfault when saving + and then retrieving a string attribute with values "0" or "0.". + Closes #253. + + +API changes +----------- + +- `Row.__contains__()` disabled because it has little sense to query for + a key in Row, and the correct way should be to query for it in + `Table.colnames` or `Table.colpathnames` better. Closes #241. + +- [Semantic change] To avoid a common pitfall when asking for the string + representation of a `Row` class, `Row.__str__()` has been redefined. + Now, it prints something like:: + + >>> for row in table: + ... print row + ... + /newgroup/table.row (Row), pointing to row #0 + /newgroup/table.row (Row), pointing to row #1 + /newgroup/table.row (Row), pointing to row #2 + + instead of:: + + >>> for row in table: + ... print row + ... + ('Particle: 0', 0, 10, 0.0, 0.0) + ('Particle: 1', 1, 9, 1.0, 1.0) + ('Particle: 2', 2, 8, 4.0, 4.0) + + Use `print row[:]` idiom if you want to reproduce the old behaviour. + Closes #252. + + +Other changes +------------- + +- After some improvements in both HDF5 and PyTables, the limit before + emitting a `PerformanceWarning` on the number of children in a group + has been raised from 4096 to 16384. + + +Changes from 2.1.1 to 2.2b1 +=========================== + +Enhancements +------------ + +- Added `Expr`, a class for evaluating expressions containing + array-like objects. It can evaluate expressions (like '3*a+4*b') + that operate on arbitrary large arrays while optimizing the + resources (basically main memory and CPU cache memory) required to + perform them. It is similar to the Numexpr package, but in addition + to NumPy objects, it also accepts disk-based homogeneous arrays, + like the `Array`, `CArray`, `EArray` and `Column` PyTables objects. + +- Added support for NumPy's extended slicing in all `Leaf` objects. + With that, you can do the next sort of selections:: + + array1 = array[4] # simple selection + array2 = array[4:1000:2] # slice selection + array3 = array[1, ..., ::2, 1:4, 4:] # general slice selection + array4 = array[1, [1,5,10], ..., -1] # fancy selection + array5 = array[np.where(array[:] > 4)] # point selection + array6 = array[array[:] > 4] # boolean selection + + Thanks to Andrew Collette for implementing this for h5py, from which + it has been backported. Closes #198 and #209. + +- Numexpr updated to 1.3.1. This can lead to up a 25% improvement of + the time for both in-kernel and indexed queries for unaligned + tables. + +- HDF5 1.8.3 supported. + + +Bugs fixed +---------- + +- Fixed problems when modifying multidimensional columns in Table + objects. Closes #228. + +- Row attribute is no longer stalled after a table move or rename. + Fixes #224. + +- Array.__getitem__(scalar) returns a NumPy scalar now, instead of a + 0-dim NumPy array. This should not be noticed by normal users, + unless they check for the type of returned value. Fixes #222. + + +API changes +----------- + +- Added a `dtype` attribute for all leaves. This is the NumPy + ``dtype`` that most closely matches the leaf type. This allows for + a quick-and-dirty check of leaf types. Closes #230. + +- Added a `shape` attribute for `Column` objects. This is formed by + concatenating the length of the column and the shape of its type. + Also, the representation of columns has changed an now includes the + length of the column as the leading dimension. Closes #231. + +- Added a new `maindim` attribute for `Column` which has the 0 value + (the leading dimension). 
This allows for a better similarity with + other \*Array objects. + +- In order to be consistent and allow the extended slicing to happen + in `VLArray` objects too, `VLArray.__setitem__()` is not able to + partially modify rows based on the second dimension passed as key. + If this is tried, an `IndexError` is raised now. Closes #210. + +- The `forceCSI` flag has been replaced by `checkCSI` in the next + `Table` methods: `copy()`, `readSorted()` and `itersorted()`. The + change reflects the fact that a re-index operation cannot be + triggered from these methods anymore. The rational for the change + is that an indexing operation is a potentially very expensive + operation that should be carried out explicitly instead of being + triggered by methods that should not be in charge of this task. + Closes #216. + + +Backward incompatible changes +----------------------------- + +- After the introduction of the `shape` attribute for `Column` + objects, the shape information for multidimensional columns has been + removed from the `dtype` attribute (it is set to the base type of + the column now). Closes #232. + + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.3.x.rst b/doc/source/release-notes/RELEASE_NOTES_v2.3.x.rst new file mode 100644 index 0000000..ba66ccb --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.3.x.rst @@ -0,0 +1,100 @@ +======================================= + Release notes for PyTables 2.3 series +======================================= + +:Author: PyTables maintainers +:Contact: pytables@googlemail.com + + +Changes from 2.3 to 2.3.1 +========================= + +- Fixed a bug that prevented to read scalar datasets of UnImplemented types + (closes :issue:`111`). Thanks to Kamil Kisiel. + +- Fixed a bug in `setup.py` that caused installation of PyTables 2.3 to fail + on hosts with multiple python versions installed (closes :issue:`113`). + Thanks to sbinet. + + +Changes from 2.2.1 to 2.3 +========================= + +Features coming from (now liberated) PyTables Pro +------------------------------------------------- + +- OPSI is a powerful and innovative indexing engine allowing PyTables to + perform fast queries on arbitrarily large tables. Moreover, it offers a wide + range of optimization levels for its indexes so that the user can choose the + best one that suits her needs (more or less size, more or less performance). + Indexation code also takes advantage of the vectorization capabilities of the + NumPy and Numexpr packages to ensure really short indexing and search times. + +- A fine-tuned LRU cache for both metadata (nodes) and regular data that lets + you achieve maximum speed for intensive object tree browsing during data + reads and queries. It complements the already efficient cache present in + HDF5, although this is more geared towards high-level structures that are + specific to PyTables and that are critical for achieving very high + performance. + +Other changes +------------- + +- Indexes with no elements are now evaluated as non-CSI ones. Closes + #312. + +- Numexpr presence is tested now in setup.py, provided that user is not + using setuptools (i.e. ``easy_install`` or ``pip`` tools). When using + setuptools, numexpr continues to be a requisite (and Cython too). + Closes #298. + +- Cython is enforced now during compilation time. Also, it is not + required when running tests. 
+ +- Repeatedly closing a file that has been reopened several times is + supported now. Closes #318. + +- The number of times a file has been currently reopened is available + now in the new `File.open_count` read-only attribute. + +- The entire documentation set has been converted to sphinx (close + :issue:`85` and :issue:`86`) that now also has an index + (closes :issue`39`). + +- The entire test suite has been updated to use unittest specific + assertions (closes :issue:`66`). + +- PyTables has been tested against the latest version of numpy (v. 1.6.1 + and 2.0dev) and Cython (v, 0.15) packages. Closes :issue:`84`. + +- The setup.py script has been improved to better detect runtimes + (closes :issue:`73`). + +Deprecations +------------ + +Support for some old packages and related features has been deprecated +and will be removed in future versions: + +- Numeric (closes :issue:`76`) +- numarray (closes :issue`76` and :issue:`75`) +- HDF5 1.6.x (closes :issue`96`) + +At the API level the following are now deprecated: + +- the tables.is_pro constant is deprecated because PyTables Pro + has been released under an open source license. +- the netcdf3 sub-package (closes :issue:`67`) +- the nra sub-package + + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v2.4.x.rst b/doc/source/release-notes/RELEASE_NOTES_v2.4.x.rst new file mode 100644 index 0000000..591300c --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v2.4.x.rst @@ -0,0 +1,156 @@ +======================================= + Release notes for PyTables 2.4 series +======================================= + +:Author: PyTables maintainers +:Contact: pytables@googlemail.com + +.. py:currentmodule:: tables + + +Changes from 2.3.1 to 2.4 +========================= + +New features +------------ + +- Improved HDF5 error logging management: + + * added a new function, :func:`silenceHDF5Messages`, for suppressing + (and re-enabling) HDF5 messages. By default HDF5 error logging is now + suppressed. Closes :issue:`87`. + * now all HDF5 error messages and trace-backs are trapped and attached to + the :exc:`exceptions.HDF5ExtError` exception instances. + Closes :issue:`120`. + +- Added support for the float16 data type. It is only available if numpy_ + provides it as well (i.e. numpy_ >= 1.6). See :issue:`51`. + +- Leaf nodes now have attributes for retrieving the size of data in memory + and on disk. Data on disk can be compressed, so the new attributes make it + easy to compute the data compression ration. + Thanks to Josh Ayers (close :issue:`141`). + +- The maximum number of threads for Blosc_ and Numexpr_ is now handled using + the :data:`parameters.MAX_BLOSC_THREADS` and + :data:`parameters.MAX_NUMEXPR_THREADS` parameters respectively. + This allows a more fine grained configuration capability. + Closes :issue:`142`. + +- `ndim` (read-only) attribute added to :class:`Leaf`, :class:`Atom` and + :class:`Col` objects (closes :issue:`126`). + +- Added read support for variable length string attributes (non scalar + attributes are converted into numpy_ arrays with 'O8' type). + See :issue:`54`. + + +Other improvements +------------------ + +- Dropped support for HDF5 1.6.x. Now PyTables uses the HDF5 1.8 API + (closes :issue:`105`). + +- Blosc_ updated to v. 1.1.3. 
+ +- The Blosc_ compression library is now automatically disabled on platforms + that do not support unaligned memory access (see also + https://github.com/FrancescAlted/blosc/issues/3 and + http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=661286). + +- Improved bzip2 detection on Windows (:issue:`116`). Thanks to cgohlke. + +- For Windows, the setup.py script now has the ability to automatically find + the HDF5_DIR in the system PATH. Thanks to Mark (mwiebe). + +- Improved multi-arch support in GNU/Linux platforms (closes :issue:`124`) + Thanks to Julian Taylor and Picca Frederic-Emmanuel. + +- Use new style syntax for exception raising. Closes :issue:`93`. + +- Fixed most of the warnings related to py3k compatibility (see :issue:`92`). + +- Fixed pyflakes_ warnings (closes :issue:`102`). + +- Cython_ extensions updated to use new constructs (closes :issue:`100`). + +- Reduced the number of build warnings (closes :issue:`101`). + +- Removed the old lrucache module. It is no more needed after the merge with + PyTables Pro (closes :issue:`118`). + +- Added explicit (import time) testing for hdf5dll.dll on Windows to improve + diagnostics (closes :issue:`146`). Thanks to Mark (mwiebe). + + +Documentation improvements +-------------------------- + +- new cookbook section (contents have been coming from the PyTables wiki + on http://www.pytables.org) + +- complete rework of the library reference. Now the entire chapter is + generated from docstrings using the sphinx autodoc extension. + A big thank you to Josh Ayers. Closes :issue:`148`. + +- new sphinx theme based on the cloud template + + +Bugs fixed +---------- + +- Fixed a segfault on platforms that do not support unaligned memory access + (closes: :issue:`134`). Thanks to Julian Taylor. + +- Fixed broken inheritance in :class:`IsDescription` classes (thanks to + Andrea Bedini). Closes :issue:`65`. + +- Fixed table descriptions copy method (closes :issue:`131`). + +- Fixed open failures handling (closes :issue:`158`). + Errors that happen when one tries to open an invalid HDF5 file (e.g. an + empty file) are now detected earlier by PyTables and a proper exception + (:exc:`exceptions.HDF5ExtError`) is raised. + Also, in case of open failures, invalid file descriptors are no more cached. + Before is fix it was not possible to completely close the bad file and reopen + the same path, even if a valid file was created in the meanwhile. + Thanks to Daniele for reporting and for the useful test code. + +- Fixed support to rich structured numpy.dtype in + :func:`description.descr_from_dtype`. Closes :issue:`160`. + +- Fixed sorting of nested tables that caused AttributeError. + Closes :issue:`156` and :issue:`157`. Thanks to Uwe Mayer. + +- Fixed flavor deregistration (closes :issue:`163`) + + +Deprecations +------------ + +- The :data:`parameters.MAX_THREADS` configuration parameter is now + deprecated. Please use :data:`parameters.MAX_BLOSC_THREADS` and + :data:`parameters.MAX_NUMEXPR_THREADS` instead. + See :issue:`142`. + +- Since the support for HDF5 1.6.x has been dropped, the *warn16incompat* + argument of the :meth:`File.createExternalLink` method and the + :exc:`exceptions.Incompat16Warning` exception class are now deprecated. + +.. _pyflakes: https://launchpad.net/pyflakes +.. _numpy: http://www.numpy.org +.. _Blosc: https://github.com/FrancescAlted/blosc +.. _Numexpr: http://code.google.com/p/numexpr +.. _Cython: http://www.cython.org + + + **Enjoy data!** + + -- The PyTables Team + + +.. Local Variables: +.. mode: rst +.. 
coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.0.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.0.x.rst new file mode 100644 index 0000000..346cd46 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.0.x.rst @@ -0,0 +1,303 @@ +======================================= + Release notes for PyTables 3.0 series +======================================= + +:Author: PyTables Developers +:Contact: pytables@googlemail.com + +.. py:currentmodule:: tables + + +Changes from 2.4 to 3.0 +======================= + +New features +------------ + +- Since this release PyTables provides full support to Python_ 3 + (closes :issue:`188`). + +- The entire code base is now more compliant with coding style guidelines + describe in the PEP8_ (closes :issue:`103` and :issue:`224`). + See `API changes`_ for more details. + +- Basic support for HDF5 drivers. Now it is possible to open/create an + HDF5 file using one of the SEC2, DIRECT, LOG, WINDOWS, STDIO or CORE + drivers. Users can also set the main driver parameters (closes + :issue:`166`). + Thanks to Michal Slonina. + +- Basic support for in-memory image files. An HDF5 file can be set from or + copied into a memory buffer (thanks to Michal Slonina). This feature is + only available if PyTables is built against HDF5 1.8.9 or newer. + Closes :issue:`165` and :issue:`173`. + +- New :meth:`File.get_filesize` method for retrieving the HDF5 file size. + +- Implemented methods to get/set the user block size in a HDF5 file + (closes :issue:`123`) + +- Improved support for PyInstaller_. Now it is easier to pack frozen + applications that use the PyTables package (closes: :issue:`177`). + Thanks to Stuart Mentzer and Christoph Gohlke. + +- All read methods now have an optional *out* argument that allows to pass a + pre-allocated array to store data (closes :issue:`192`) + +- Added support for the floating point data types with extended precision + (Float96, Float128, Complex192 and Complex256). This feature is only + available if numpy_ provides it as well. + Closes :issue:`51` and :issue:`214`. Many thanks to Andrea Bedini. + +- Consistent ``create_xxx()`` signatures. Now it is possible to create all + data sets :class:`Array`, :class:`CArray`, :class:`EArray`, + :class:`VLArray`, and :class:`Table` from existing Python objects (closes + :issue:`61` and :issue:`249`). See also the `API changes`_ section. + +- Complete rewrite of the :mod:`nodes.filenode` module. Now it is fully + compliant with the interfaces defined in the standard :mod:`io` module. + Only non-buffered binary I/O is supported currently. + See also the `API changes`_ section. Closes :issue:`244`. + +- New :program:`pt2to3` tool is provided to help users to port their + applications to the new API (see `API changes`_ section). + + +Improvements +------------ + +- Improved runtime checks on dynamic loading of libraries: meaningful error + messages are generated in case of failure. + Also, now PyTables no more alters the system PATH. + Closes :issue:`178` and :issue:`179` (thanks to Christoph Gohlke). + +- Improved list of search paths for libraries as suggested by Nicholaus + Halecky (see :issue:`219`). + +- Removed deprecated Cython_ include (.pxi) files. Contents of + :file:`convtypetables.pxi` have been moved in :file:`utilsextension.pyx`. + Closes :issue:`217`. + +- The internal Blosc_ library has been upgraded to version 1.2.3. 
+ +- Pre-load the bzip2_ library on windows (closes :issue:`205`) + +- The :meth:`File.get_node` method now accepts unicode paths + (closes :issue:`203`) + +- Improved compatibility with Cython_ 0.19 (see :issue:`220` and + :issue:`221`) + +- Improved compatibility with numexpr_ 2.1 (see also :issue:`199` and + :issue:`241`) + +- Improved compatibility with development versions of numpy_ + (see :issue:`193`) + +- Packaging: since this release the standard tar-ball package no more includes + the PDF version of the "PyTables User Guide", so it is a little bit smaller + now. The complete and pre-build version of the documentation both in HTML + and PDF format is available on the file `download area`_ on SourceForge.net. + Closes: :issue:`172`. + +- Now PyTables also uses `Travis-CI`_ as continuous integration service. + All branches and all pull requests are automatically tested with different + Python_ versions. Closes :issue:`212`. + + +Other changes +------------- + +- PyTables now requires Python 2.6 or newer. + +- Minimum supported version of Numexpr_ is now 2.0. + + +API changes +----------- + +The entire PyTables API as been made more PEP8_ compliant (see :issue:`224`). + +This means that many methods, attributes, module global variables and also +keyword parameters have been renamed to be compliant with PEP8_ style +guidelines (e.g. the ``tables.hdf5Version`` constant has been renamed into +``tables.hdf5_version``). + +We made the best effort to maintain compatibility to the old API for existing +applications. In most cases, the old 2.x API is still available and usable +even if it is now deprecated (see the Deprecations_ section). + +The only important backwards incompatible API changes are for names of +function/methods arguments. All uses of keyword arguments should be +checked and fixed to use the new naming convention. + +The new :program:`pt2to3` tool can be used to port PyTables based applications +to the new API. + +Many deprecated features and support for obsolete modules has been dropped: + +- The deprecated :data:`is_pro` module constant has been removed + +- The nra module and support for the obsolete numarray module has been removed. + The *numarray* flavor is no more supported as well (closes :issue:`107`). + +- Support for the obsolete Numeric module has been removed. + The *numeric* flavor is no longer available (closes :issue:`108`). + +- The tables.netcdf3 module has been removed (closes :issue:`68`). + +- The deprecated :exc:`exceptions.Incompat16Warning` exception has been + removed + +- The :meth:`File.create_external_link` method no longer has a keyword + parameter named *warn16incompat*. It was deprecated in PyTables 2.4. + +Moreover: + +- The :meth:`File.create_array`, :meth:`File.create_carray`, + :meth:`File.create_earray`, :meth:`File.create_vlarray`, and + :meth:`File.create_table` methods of the :class:`File` objects gained a + new (optional) keyword argument named ``obj``. It can be used to initialize + the newly created dataset with an existing Python object, though normally + these are numpy_ arrays. + + The *atom*/*descriptor* and *shape* parameters are now optional if the + *obj* argument is provided. + +- The :mod:`nodes.filenode` has been completely rewritten to be fully + compliant with the interfaces defined in the :mod:`io` module. + + The FileNode classes currently implemented are intended for binary I/O. 
+ + Main changes: + + * the FileNode base class is no more available, + * the new version of :class:`nodes.filenode.ROFileNode` and + :class:`nodes.filenode.RAFileNode` objects no more expose the *offset* + attribute (the *seek* and *tell* methods can be used instead), + * the *lineSeparator* property is no more available and the ``\n`` + character is always used as line separator. + +- The `__version__` module constants has been removed from almost all the + modules (it was not used after the switch to Git). Of course the package + level constant (:data:`tables.__version__`) still remains. + Closes :issue:`112`. + +- The :func:`lrange` has been dropped in favor of xrange (:issue:`181`) + +- The :data:`parameters.MAX_THREADS` configuration parameter has been dropped + in favor of :data:`parameters.MAX_BLOSC_THREADS` and + :data:`parameters.MAX_NUMEXPR_THREADS` (closes :issue:`147`). + +- The :func:`conditions.compile_condition` function no more has a *copycols* + argument, it was no more necessary since Numexpr_ 1.3.1. + Closes :issue:`117`. + +- The *expectedsizeinMB* parameter of the :meth:`File.create_vlarray` and of + the :meth:`VLArrsy.__init__` methods has been replaced by *expectedrows*. + See also (:issue:`35`). + +- The :meth:`Table.whereAppend` method has been renamed into + :meth:`Table.append_where` (closes :issue:`248`). + +Please refer to the :doc:`../MIGRATING_TO_3.x` document for more details about +API changes and for some useful hint about the migration process from the 2.X +API to the new one. + + +Other possibly incompatible changes +----------------------------------- + +- All methods of the :class:`Table` class that take *start*, *stop* and + *step* parameters (including :meth:`Table.read`, :meth:`Table.where`, + :meth:`Table.iterrows`, etc) have been redesigned to have a consistent + behaviour. The meaning of the *start*, *stop* and *step* and their default + values now always work exactly like in the standard :class:`slice` objects. + Closes :issue:`44` and :issue:`255`. + +- Unicode attributes are not stored in the HDF5 file as pickled string. + They are now saved on the HDF5 file as UTF-8 encoded strings. + + Although this does not introduce any API breakage, files produced are + different (for unicode attributes) from the ones produced by earlier + versions of PyTables. + +- System attributes are now stored in the HDF5 file using the character set + that reflects the native string behaviour: ASCII for Python 2 and UTF8 for + Python 3. In any case, system attributes are represented as Python string. + +- The :meth:`iterrows` method of :class:`*Array` and :class:`Table` as well + as the :meth:`Table.itersorted` now behave like functions in the standard + :mod:`itertools` module. + If the *start* parameter is provided and *stop* is None then the + array/table is iterated from *start* to the last line. + In PyTables < 3.0 only one element was returned. + + +Deprecations +------------ + +- As described in `API changes`_, all functions, methods and attribute names + that was not compliant with the PEP8_ guidelines have been changed. + Old names are still available but they are deprecated. + +- The use of upper-case keyword arguments in the :func:`open_file` function + and the :class:`File` class initializer is now deprecated. All parameters + defined in the :file:`tables/parameters.py` module can still be passed as + keyword argument to the :func:`open_file` function just using a lower-case + version of the parameter name. 
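+
+A short sketch of the old spelling next to its PEP8_ counterpart (the
+file and node names, as well as the ``node_cache_slots`` value, are
+only illustrative)::
+
+    import numpy as np
+    import tables
+
+    # 2.x style: camel-case methods and upper-case parameters
+    # (deprecated but still usable).
+    # fileh = tables.openFile('demo.h5', mode='w', NODE_CACHE_SLOTS=128)
+    # fileh.createArray('/', 'x', np.arange(10))
+
+    # PyTables 3.0 style: PEP8 names, lower-case parameters and the
+    # new optional ``obj`` argument of the create_*() methods.
+    fileh = tables.open_file('demo.h5', mode='w', node_cache_slots=128)
+    fileh.create_array('/', 'x', obj=np.arange(10))
+    fileh.close()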
+ + +Bugs fixed +---------- + +- Better check access on closed files (closes :issue:`62`) + +- Fix for :meth:`File.renameNode` where in certain cases + :meth:`File._g_updateLocation` was wrongly called (closes :issue:`208`). + Thanks to Michka Popoff. + +- Fixed ptdump failure on data with nested columns (closes :issue:`213`). + Thanks to Alexander Ford. + +- Fixed an error in :func:`open_file` when *filename* is a :class:`numpy.str_` + (closes :issue:`204`) + +- Fixed :issue:`119`, :issue:`230` and :issue:`232`, where an index on + :class:`Time64Col` (only, :class:`Time32Col` was ok) hides the data on + selection from a Tables. Thanks to Jeff Reback. + +- Fixed ``tables.tests.test_nestedtypes.ColsTestCase.test_00a_repr`` test + method. Now the ``repr`` of cols on big-endian platforms is correctly + handled (closes :issue:`237`). + +- Fixes bug with completely sorted indexes where *nrowsinbuf* must be equal + to or greater than the *chunksize* (thanks to Thadeus Burgess). + Closes :issue:`206` and :issue:`238`. + +- Fixed an issue of the :meth:`Table.itersorted` with reverse iteration + (closes :issue:`252` and :issue:`253`). + + +.. _Python: http://www.python.org +.. _PEP8: http://www.python.org/dev/peps/pep-0008 +.. _PyInstaller: http://www.pyinstaller.org +.. _Blosc: https://github.com/FrancescAlted/blosc +.. _bzip2: http://www.bzip.org +.. _Cython: http://www.cython.org +.. _Numexpr: http://code.google.com/p/numexpr +.. _numpy: http://www.numpy.org +.. _`download area`: http://sourceforge.net/projects/pytables/files/pytables +.. _`Travis-CI`: https://travis-ci.org + + + **Enjoy data!** + + -- The PyTables Developers + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.1.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.1.x.rst new file mode 100644 index 0000000..6b0329e --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.1.x.rst @@ -0,0 +1,221 @@ +Changes from 3.1.0 to 3.1.1 +=========================== + +Bugs fixed +---------- + +- Fixed a critical bug that caused an exception at import time. + The error was triggered when a bug in long-double detection is detected + in the HDF5 library (see :issue:`275`) and numpy_ does not expose + `float96` or `float128`. Closes :issue:`344`. +- The internal Blosc_ library has been updated to version 1.3.5. + This fixes a false buffer overrun condition that made c-blosc to fail, + even if the problem was not real. + + +Improvements +------------ + +- Do not create a temporary array when the *obj* parameter is not specified + in :meth:`File.create_array` (thanks to Francesc). + Closes :issue:`337` and :issue:`339`). +- Added two new utility functions + (:func:`tables.nodes.filenode.read_from_filenode` and + :func:`tables.nodes.filenode.save_to_filenode`) for the direct copy from + filesystem to filenode and vice versa (closes :issue:`342`). + Thanks to Andreas Hilboll. +- Removed the :file:`examples/nested-iter.py` considered no longer useful. + Closes :issue:`343`. +- Better detection of the `-msse2` compiler flag. + + +Changes from 3.0 to 3.1.0 +========================= + +New features +------------ + +- Now PyTables is able to save/restore the default value of :class:`EnumAtom` + types (closes :issue:`234`). +- Implemented support for the H5FD_SPLIT driver (closes :issue:`288`, + :issue:`289` and :issue:`295`). Many thanks to simleo. 
+- New quantization filter: the filter truncates floating point data to a + specified precision before writing to disk. This can significantly improve + the performance of compressors (closes :issue:`261`). + Thanks to Andreas Hilboll. +- Added new :meth:`VLArray.get_row_size` method to :class:`VLArray` for + querying the number of atoms of a :class:`VLArray` row. + Closes :issue:`24` and :issue:`315`. +- The internal Blosc_ library has been updated to version 1.3.2. + All new features introduced in the Blosc_ 1.3.x series, and in particular + the ability to leverage different compressors within Blosc_ (see the `Blosc + Release Notes`_), are now available in PyTables via the blosc filter + (closes: :issue:`324`). A big thank you to Francesc. + + +Improvements +------------ + +- The node caching mechanism has been completely redesigned to be simpler and + less dependent from specific behaviours of the ``__del__`` method. + Now PyTables is compatible with the forthcoming Python 3.4. + Closes :issue:`306`. +- PyTables no longer uses shared/cached file handlers. This change somewhat + improves support for concurrent reading allowing the user to safely open the + same file in different threads for reading (requires HDF5 >= 1.8.7). + More details about this change can be found in the `Backward incompatible + changes`_ section. + See also :issue:`130`, :issue:`129` :issue:`292` and :issue:`216`. +- PyTables is now able to detect and use external installations of the Blosc_ + library (closes :issue:`104`). If Blosc_ is not found in the system, and the + user do not specify a custom installation directory, then it is used an internal + copy of the Blosc_ source code. +- Automatically disable extended float support if a buggy version of HDF5 + is detected (see also `Issues with H5T_NATIVE_LDOUBLE`_). + See also :issue:`275`, :issue:`290` and :issue:`300`. +- Documented an unexpected behaviour with string literals in query conditions + on Python 3 (closes :issue:`265`) +- The deprecated :mod:`getopt` module has been dropped in favour of + :mod:`argparse` in all command line utilities (close :issue:`251`) +- Improved the installation section of the :doc:`../usersguide/index`. + + * instructions for installing PyTables via pip_ have been added. + * added a reference to the Anaconda_, Canopy_ and `Christoph Gohlke suites`_ + (closes :issue:`291`) + +- Enabled `Travis-CI`_ builds for Python_ 3.3 +- :meth:`Tables.read_coordinates` now also works with boolean indices input. + Closes :issue:`287` and :issue:`298`. +- Improved compatibility with numpy_ >= 1.8 (see :issue:`259`) +- The code of the benchmark programs (bench directory) has been updated. + Closes :issue:`114`. +- Fixed some warning related to non-unicode file names (the Windows bytes API + has been deprecated in Python 3.4) + + +Bugs fixed +---------- + +- Fixed detection of platforms supporting Blosc_ +- Fixed a crash that occurred when one attempts to write a numpy_ array to + an :class:`Atom` (closes :issue:`209` and :issue:`296`) +- Prevent creation of a table with no columns (closes :issue:`18` and + :issue:`299`) +- Fixed a memory leak that occured when iterating over + :class:`CArray`/:class:`EArray` objects (closes :issue:`308`, + see also :issue:`309`). + Many thanks to Alistair Muldal. +- Make NaN types sort to the end. Closes :issue:`282` and :issue:`313` +- Fixed selection on float columns when NaNs are present (closes :issue:`327` + and :issue:`330`) +- Fix computation of the buffer size for iterations on rows. 
+ The buffers size was overestimated resulting in a :exc:`MemoryError` + in some cases. + Closes :issue:`316`. Thamks to bbudescu. +- Better check of file open mode. Closes :issue:`318`. +- The Blosc filter now works correctly together with fletcher32. + Closes :issue:`21`. +- Close the file handle before trying to delete the corresponding file. + Fixes a test failure on Windows. +- Use integer division for computing indices (fixes some warning on Windows) + + +Deprecations +------------ + +Following the plan for the complete transition to the new (PEP8_ compliant) +API, all calls to the old API will raise a :exc:`DeprecationWarning`. + +The new API has been introduced in PyTables 3.0 and is backward incompatible. +In order to guarantee a smoother transition the old API is still usable even +if it is now deprecated. + +The plan for the complete transition to the new API is outlined in +:issue:`224`. + + +Backward incompatible changes +----------------------------- + +In PyTables <= 3.0 file handles (objects that are returned by the +:func:`open_file` function) were stored in an internal registry and re-used +when possible. + +Two subsequent attempts to open the same file (with compatible open mode) +returned the same file handle in PyTables <= 3.0:: + + In [1]: import tables + In [2]: print(tables.__version__) + 3.0.0 + In [3]: a = tables.open_file('test.h5', 'a') + In [4]: b = tables.open_file('test.h5', 'a') + In [5]: a is b + Out[5]: True + +All this is an implementation detail, it happened under the hood and the user +had no control over the process. + +This kind of behaviour was considered a feature since it can speed up opening +of files in case of repeated opens and it also avoids any potential problem +related to multiple opens, a practice that the HDF5 developers recommend to +avoid (see also H5Fopen_ reference page). + +The trick, of course, is that files are not opened multiple times at HDF5 +level, rather an open file is referenced several times. + +The big drawback of this approach is that there are really few chances to use +PyTables safely in a multi thread program. Several bug reports have been +filed regarding this topic. + +After long discussions about the possibility to actually achieve concurrent I/O +and about patterns that should be used for the I/O in concurrent programs +PyTables developers decided to remove the *black magic under the hood* and +allow the users to implement the patterns they want. + +Starting from PyTables 3.1 file handles are no more re-used (*shared*) and +each call to the :func:`open_file` function returns a new file handle:: + + In [1]: import tables + In [2]: print tables.__version__ + 3.1.0 + In [3]: a = tables.open_file('test.h5', 'a') + In [4]: b = tables.open_file('test.h5', 'a') + In [5]: a is b + Out[5]: False + +It is important to stress that the new implementation still has an internal +registry (implementation detail) and it is still **not thread safe**. +Just now a smart enough developer should be able to use PyTables in a +muti-thread program without too much headaches. + +The new implementation behaves differently from the previous one, although the +API has not been changed. Now users should pay more attention when they open a +file multiple times (as recommended in the `HDF5 reference`__ ) and they +should take care of using them in an appropriate way. + +__ H5Fopen_ + +Please note that the :attr:`File.open_count` property was originally intended +to keep track of the number of references to the same file handle. 
+In PyTables >= 3.1, despite of the name, it maintains the same semantics, just +now its value should never be higher that 1. + +.. note:: + + HDF5 versions lower than 1.8.7 are not fully compatible with PyTables 3.1. + A partial support to HDF5 < 1.8.7 is still provided but in that case + multiple file opens are not allowed at all (even in read-only mode). + + +.. _pip: http://www.pip-installer.org +.. _Anaconda: https://store.continuum.io/cshop/anaconda +.. _Canopy: https://www.enthought.com/products/canopy +.. _`Christoph Gohlke suites`: http://www.lfd.uci.edu/~gohlke/pythonlibs +.. _`Issues with H5T_NATIVE_LDOUBLE`: https://forum.hdfgroup.org/t/issues-with-h5t-native-ldouble/2991 +.. _Python: http://www.python.org +.. _Blosc: http://www.blosc.org +.. _numpy: http://www.numpy.org +.. _`Travis-CI`: https://travis-ci.org +.. _PEP8: http://www.python.org/dev/peps/pep-0008 +.. _`Blosc Release Notes`: https://github.com/FrancescAlted/blosc/wiki/Release-notes +.. _H5Fopen: https://portal.hdfgroup.org/display/HDF5/Files diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.2.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.2.x.rst new file mode 100644 index 0000000..89c10fb --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.2.x.rst @@ -0,0 +1,279 @@ +======================================= + Release notes for PyTables 3.2 series +======================================= + +:Author: PyTables Developers +:Contact: pytables-dev@googlegroups.com + +.. py:currentmodule:: tables + + +Changes from 3.2.3 to 3.2.3.1 +============================= + +Fixed issues with pip install. + + +Changes from 3.2.2 to 3.2.3 +=========================== + +Improvements +------------ + +- It is now possible to use HDF5 with the new shared library naming scheme + (>= 1.8.10, hdf5.dll instead of hdf5dll.dll) on Windows (:issue:`540`). + Thanks to Tadeu Manoel. +- Now :program: `ptdump` sorts output by node name and does not print a + backtrace if file cannot be opened. + Thanks to Zbigniew Jędrzejewski-Szmek. + + +Bugs fixed +---------- + +- Only run `tables.tests.test_basics.UnicodeFilename` if the filesystem + encoding is utf-8. Closes :issue:`485`. +- Add lib64 to posix search path. (closes :issue:`507`) + Thanks to Mehdi Sadeghi. +- Ensure cache entries are removed if fewer than 10 (closes :issue:`529`). + Thanks to Graham Jones. +- Fix segmentation fault in a number of test cases that use + :class:`index.Index` (closes :issue:`532` and :issue:`533`). + Thanks to Diane Trout. +- Fixed the evaluation of transcendental functions when numexpr is + compiled with VML support (closes :issue:`534`, PR #536). + Thanks to Tom Kooij. +- Make sure that index classes use buffersizes that are a multiple + of chunkshape[0] (closes :issue:`538`, PR #538). + Thanks to Tom Kooij. +- Ensure benchmark paths exist before benchmarks are executed (PR #544). + Thanks to rohitjamuar. + +Other changes +------------- + +- Minimum Cython_ version is now v0.21 + + +.. _Cython: http://cython.org + + +Changes from 3.2.1.1 to 3.2.2 +============================= + +Bug fixed +--------- + +- Fix AssertionError in Row.__init_loop. See :issue:`477`. +- Fix issues with Cython 0.23. See :issue:`481`. +- Only run `tables.tests.test_basics.UnicodeFilename` if the filesystem + encoding is utf-8. Closes :issue:`485`. +- Fix missing PyErr_Clear. See :issue:`486`. +- Fix the C type of some numpy attributes. See :issue:`494`. +- Cast selection indices to integer. See :issue:`496`. +- Fix indexesextension._keysort_string. 
Closes :issue:`497` and :issue:`498`. + + +Changes from 3.2.1 to 3.2.1.1 +============================= + +- Fix permission on distributed source distribution + +Other changes +------------- + +- Minimum Cython_ version is now v0.21 + + +.. _Cython: http://cython.org + + +Changes from 3.2.0 to 3.2.1 +=========================== + +Bug fixed +--------- + +- Fix indexesextension._keysort. Fixes :issue:`455`. Thanks to Andrew Lin. + + +Changes from 3.1.1 to 3.2.0 +=========================== + +Improvements +------------ + +- The `nrowsinbuf` is better computed now for EArray/CArray having + a small `chunkshape` in the main dimension. Fixes #285. + +- PyTables should be installable very friendly via pip, including NumPy + being installed automatically in the unlikely case it is not yet + installed in the system. Thanks to Andrea Bedini. + +- setup.py has been largely simplified and now it requires *setuptools*. + Although we think this is a good step, please keep us informed this is + breaking some installation in a very bad manner. + +- setup.py now is able to used *pkg-config*, if available, to locate required + libraries (hdf5, bzip2, etc.). The use of *pkg-config* can be controlled + via setup.py command line flags or via environment variables. + Please refer to the installation guide (in the *User Manual*) for details. + Closes :issue:`442`. + +- It is now possible to create a new node whose parent is a softlink to another + group (see :issue:`422`). Thanks to Alistair Muldal. + +- :class:`link.SoftLink` objects no longer need to be explicitly dereferenced. + Methods and attributes of the linked object are now automatically accessed + when the user acts on a soft-link (see :issue:`399`). + Thanks to Alistair Muldal. + +- Now :program:`ptrepack` recognizes hardlinks and replicates them in the + output (*repacked*) file. This saves disk space and makes repacked files + more conformal to the original one. Closes :issue:`380`. + +- New :program:`pttree` script for printing HDF5 file contents as a pretty + ASCII tree (closes :issue:`400`). Thanks to Alistair Muldal. + +- The internal Blosc library has been downgraded to version 1.4.4. This + is in order to still allow using multiple threads *inside* Blosc, even + on multithreaded applications (see :issue:`411`, :issue:`412`, + :issue:`437` and :issue:`448`). + +- The :func:`print_versions` function now also reports the version of + compression libraries used by Blosc. + +- Now the :file:`setup.py` tries to use the '-march=native' C flag by + default. In falls back on '-msse2' if '-march=native' is not supported + by the compiler. Closes :issue:`379`. + +- Fixed a spurious unicode comparison warning (closes :issue:`372` and + :issue:`373`). + +- Improved handling of empty string attributes. In previous versions of + PyTables empty string were stored as scalar HDF5 attributes having size 1 + and value '\0' (an empty null terminated string). + Now empty string are stored as HDF5 attributes having zero size + +- Added a new cookbook recipe and a couple of examples for simple threading + with PyTables. + +- The redundant :func:`utilsextension.get_indices` function has been + eliminated (replaced by :meth:`slice.indices`). Closes :issue:`195`. + +- Allow negative indices in point selection (closes :issue:`360`) + +- Index wasn't being used if it claimed there were no results. 
+ Closes :issue:`351` (see also :issue:`353`) + +- Atoms and Col types are no longer generated dynamically so now it is easier + for IDEs and static analysis tool to handle them (closes :issue:`345`) + +- The keysort functions in idx-opt.c have been cythonised using fused types. + The perfomance is mostly unchanged, but the code is much more simpler now. + Thanks to Andrea Bedini. + +- Small unit tests re-factoring: + + * :func:`print_versions` and :func:`tests.common.print_heavy` functions + moved to the :mod:`tests.common` module + + * always use :func:`print_versions` when test modules are called as scripts + + * use the unittest2_ package in Python 2.6.x + + * removed internal machinery used to replicate unittest2_ features + + * always use :class:`tests.common.PyTablesTestCase` as base class for all + test cases + + * code of the old :func:`tasts.common.cleanup` function has been moved to + :meth:`tests.common.PyTablesTestCase.tearDown` method + + * new implementation of :meth:`tests.common.PyTablesTestCase.assertWarns` + compatible with the one provided by the standard :mod:`unittest` module + in Python >= 3.2 + + * use :meth:`tests.common.PyTablesTestCase.assertWarns` as context manager + when appropriate + + * use the :func:`unittest.skipIf` decorator when appropriate + + * new :class:tests.comon.TestFileMixin: class + + +.. _unittest2: https://pypi.python.org/pypi/unittest2 + + +Bugs fixed +---------- + +- Fixed compatibility problems with numpy 1.9 and 1.10-dev + (closes :issue:`362` and :issue:`366`) + +- Fixed compatibility with Cython >= 0.20 (closes :issue:`386` and + :issue:`387`) + +- Fixed support for unicode node names in LRU cache (only Python 2 was + affected). Closes :issue:`367` and :issue:`369`. + +- Fixed support for unicode node titles (only Python 2 was affected). + Closes :issue:`370` and :issue:`374`. + +- Fixed a bug that caused the silent truncation of unicode attributes + containing the '\0' character. Closes :issue:`371`. + +- Fixed :func:`descr_from_dtype` to work as expected with complex types. + Closes :issue:`381`. + +- Fixed the :class:`tests.test_basics.ThreadingTestCase` test case. + Closes :issue:`359`. + +- Fix incomplete results when performing the same query twice and exhausting + the second iterator before the first. The first one writes incomplete + results to *seqcache* (:issue:`353`) + +- Fix false results potentially going to *seqcache* if + :meth:`tableextension.Row.update` is used during iteration + (see :issue:`353`) + +- Fix :meth:`Column.create_csindex` when there's NaNs + +- Fixed handling of unicode file names on windows (closes :issue:`389`) + +- No longer not modify :data:`sys.argv` at import time (closes :issue:`405`) + +- Fixed a performance issue on NFS (closes :issue:`402`) + +- Fixed a nasty problem affecting results of indexed queries. + Closes :issue:`319` and probably :issue:`419` too. + +- Fixed another problem affecting results of indexed queries too. + Closes :issue:`441`. + +- Replaced "len(xrange(start, stop, step))" -> "len(xrange(0, stop - + start, step))" to fix issues with large row counts with Python 2.x. + Fixes #447. + + +Other changes +------------- + +- Cython is not a hard dependency anymore (although developers will need it + so as to generated the C extension code). + +- The number of threads used by default for numexpr and Blosc operation that + was set to the number of available cores have been reduced to 2. This is + a much more reasonable setting for not creating too much overhead. 
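+
+As a minimal illustration (not part of the original notes), the defaults can
+be raised again on a per-file basis by passing the parameter names defined in
+:file:`tables/parameters.py` as keyword arguments to :func:`open_file`; the
+file name below is only a placeholder::
+
+    import tables
+
+    # Ask numexpr and Blosc to use up to 4 threads for this file only.
+    h5file = tables.open_file('data.h5', mode='r',
+                              MAX_NUMEXPR_THREADS=4,
+                              MAX_BLOSC_THREADS=4)
+    h5file.close()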
+ + + **Enjoy data!** + + -- The PyTables Developers + + +.. Local Variables: +.. mode: rst +.. coding: utf-8 +.. fill-column: 72 +.. End: diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.3.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.3.x.rst new file mode 100644 index 0000000..4401990 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.3.x.rst @@ -0,0 +1,33 @@ +======================================= + Release notes for PyTables 3.3 series +======================================= + +Changes from 3.2.3.1 to 3.3.0 +============================= + +Improvements +------------ + +- Single codebase Python 2 and 3 support (PR #493). +- Internal Blosc version updated to 1.11.1 (closes :issue:`541`) +- Full BitShuffle support for new Blosc versions (>= 1.8). +- It is now possible to remove all rows from a table. +- It is now possible to read reference types by dereferencing them as + numpy array of objects (closes :issue:`518` and :issue:`519`). + Thanks to Ehsan Azar +- Get rid of the `-native` compile flag (closes :issue:`503`) +- The default number of threads to run numexpr (MAX_NUMEXPR_THREADS) + internally has been raised from 2 to 4. This is because we are in + 2016 and 4 core configurations are becoming common. +- In order to avoid locking issues when using PyTables concurrently in + several process, MAX_BLOSC_THREADS has been set to 1 by default. If + you are running PyTables in one single process, you may want to + experiment if higher values (like 2 or 4) bring better performance for + you. + +Bugs fixed +---------- + +- On python 3 try 'latin1' encoding before 'bytes' encoding during unpickling + of node attributes pickled on python 2. Better fix for :issue:`560`. +- Fixed Windows 32 and 64-bit builds. diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.4.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.4.x.rst new file mode 100644 index 0000000..0d5ff07 --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.4.x.rst @@ -0,0 +1,107 @@ +======================================= + Release notes for PyTables 3.4 series +======================================= + +Changes from 3.4.3 to 3.4.4 +=========================== + +Improvements +------------ + - Environment variable to control the use of embedded libraries. + Thanks to avalentino. + - Include citation in repository. + :issue:`690`. Thanks to katrinleinweber. + +Bugs fixed +---------- + - Fixed import error with numexpr 2.6.5.dev0 + :issue:`685`. Thanks to cgohlke. + - Fixed linter warnings. + Thanks to avalentino. + - Fixed for re.split() is version detection. + :issue:`687`. Thanks to mingwandroid. + - Fixed test failures with Python 2.7 and NumPy 1.14.3 + :issue:`688` & :issue:`689`. Thanks to oleksandr-pavlyk. + + +Changes from 3.4.2 to 3.4.3 +=========================== + +Improvements +------------ + - On interactive python sessions, group/attribute `__dir__()` method + autocompletes children that are named as valid python identifiers. + :issue:`624` & :issue:`625` thanks to ankostis. + - Implement `Group.__getitem__()` to have groups act as python-containers, + so code like this works: ``hfile.root['some child']``. + :issue:`628` thanks to ankostis. + - Enable building with Intel compiler (icc/icpc). + Thanks to rohit-jamuar. + - PEP 519 support, using new `os.fspath` method. + Thanks to mruffalo. + - Optional disable recording of ctime (metadata creation time) when + creating datasets that makes possible to get bitwise identical output + from repeated runs. + Thanks to alex-cobb. 
+ - Prevent from reading all rows for each coord in a VLArray when + indexing using a list . + Thanks to igormq. + - Internal Blosc version updated to 1.14.3 + +Bugs fixed +---------- + - Fixed division by zero when using `_convert_time64()` with an empty + nparr array. + :issue:`653`. Thanks to alobbs. + - Fixed deprecation warnings with numpy 1.14. + Thanks to oleksandr-pavlyk. + - Skip DLL check when running from a frozen app. + :issue:`675`. Thanks to jwiggins. + - Fixed behaviour with slices out of range. + :issue:`651`. Thanks to jackdbd. + + +Changes from 3.4.1 to 3.4.2 +=========================== + +Improvements +------------ + - setup.py detects conda env and uses installed conda (hdf5, bzip2, lzo + and/or blosc) packages when building from source. + +Bugs fixed +---------- + - Linux wheels now built against built-in blosc. + - Fixed windows absolute paths in ptrepack, ptdump, ptree. + :issue:`616`. Thanks to oscar6echo. + + +Changes from 3.4.0 to 3.4.1 +=========================== + +Bugs fixed +---------- + - Fixed bug in ptrepack + + +Changes from 3.3.0 to 3.4.0 +=========================== + +Improvements +------------ + - Support for HDF5 v1.10.x (see :issue:`582`) + - Fix compatibility with the upcoming Python 2.7.13, 3.5.3 and 3.6 versions. + See also :issue:`590`. Thanks to Yaroslav Halchenko + - Internal Blosc version updated to 1.11.3 + - Gracefully handle cpuinfo failure. (PR #578) + Thanks to Zbigniew Jędrzejewski-Szmek + - Update internal py-cpuinfo to 3.3.0. Thanks to Gustavo Serra Scalet. + +Bugs fixed +---------- + - Fix conversion of python 2 `long` type to `six.integer_types` in atom.py. + See also :issue:`598`. Thanks to Kyle Keppler for reporting. + - Fix important bug in bitshuffle filter in internal Blosc on big-endian + machines. See also :issue:`583`. + - Fix allow for long type in nextafter. (PR #587) Thanks to Yaroslav Halchenko. + - Fix unicode bug in group and tables names. :issue:`514` diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.5.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.5.x.rst new file mode 100644 index 0000000..b4630db --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.5.x.rst @@ -0,0 +1,50 @@ +======================================= + Release notes for PyTables 3.5 series +======================================= + +:Author: PyTables Developers +:Contact: pytables-dev@googlegroups.com + +.. py:currentmodule:: tables + + +Changes from 3.5.1 to 3.5.2 +=========================== + +- Fixed compatibility with python 3.8: Fixed `Dictionary keys changed during + iteration` RuntimeError while moving/renaming a node. + Thanks to Christoph Gohlke for reporting and Miro Hrončok for help with + building PyTables for python 3.8alpha (cython compatibility). + see :issue:`733` and PR #737. +- Fixed a bug in offset calculations producing floats instead of ints + affecting python 3. See PR #736. Thanks to Brad Montgomery. + + +Changes from 3.5.0 to 3.5.1 +=========================== + +- Maintenance release to fix how PyPi repo is handling wheel versions. + + +Changes from 3.4.4 to 3.5.0 +=========================== + +Improvements +------------ + - When copying data from native HDF5 files with padding in compound types, + the padding is not removed now by default. This allows for better + compatibility with existing HDF5 applications that expect the padding + to stay there. + Also, when the `description` is a NumPy struct array with padding, this + is honored now. The previous behaviour (i.e. 
getting rid of paddings) can + be replicated by passing the new `allow_padding` parameter when opening + a file. For some examples, see the new `examples/tables-with-padding.py` + and `examples/attrs-with-padding.py`. For details on the implementation + see :issue:`720`. + - Added a new flag `--dont-allow-padding` in `ptrepack` utility so as to + replicate the previous behaviour of removing padding during file copies. + The default is to honor the original padding in copies. + - Improve compatibility with numpy 1.16. + - Improve detection of the LZO2 library at build time. + - Suppress several warnings. + - Add AVX2 support for Windows. See PR #716. Thanks to Robert McLeod. diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.6.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.6.x.rst new file mode 100644 index 0000000..fd5fdcb --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.6.x.rst @@ -0,0 +1,41 @@ +======================================= + Release notes for PyTables 3.6 series +======================================= + +:Author: PyTables Developers +:Contact: pytables-dev@googlegroups.com + +.. py:currentmodule:: tables + + +Changes from 3.6.0 to 3.6.1 +=========================== + +Maintenance release to fix packaging issues. No new features or bugfixes. + + +Changes from 3.5.3 to 3.6.0 +=========================== + +PyTables 3.6 no longer supports Python 2.7 see PR #747. + +Improvements +------------ +- Full python 3.8 support. +- On Windows PyTables wheels on PyPI are linked to `pytables_hdf5.dll` instead + of `hdf5.dll` to prevent collisions with other packages/wheels that also + vendor `hdf5.dll`. + This should prevent problems that arise when a different version of a dll + is imported that the version to which the program was linked to. + This problem is known as "DLL Hell". + With the renaming of the HDF5 DLL to `pytables_hdf5.dll` these problems + should be solved. + +Bugfixes +-------- +- Bugfix for HDF5 files/types with padding. For details see :issue:`734`. +- More fixes for python 3.8 compatibility: Replace deprecated time.clock + with time.perf_counter + Thanks to Sergio Pascual (sergiopasra). see :issue:`744` and PR #745. +- Improvements in tests as well as clean up from dropping Python 2.7 support. + Thanks to Seth Troisi (sethtroisi). diff --git a/doc/source/release-notes/RELEASE_NOTES_v3.7.x.rst b/doc/source/release-notes/RELEASE_NOTES_v3.7.x.rst new file mode 100644 index 0000000..6adc4af --- /dev/null +++ b/doc/source/release-notes/RELEASE_NOTES_v3.7.x.rst @@ -0,0 +1 @@ +.. include:: ../../../RELEASE_NOTES.rst diff --git a/doc/source/release_notes.rst b/doc/source/release_notes.rst new file mode 100644 index 0000000..07f596d --- /dev/null +++ b/doc/source/release_notes.rst @@ -0,0 +1,141 @@ +====================== +PyTables Release Notes +====================== + +Migration +---------- + +.. toctree:: + :maxdepth: 1 + + Migrating from 2.x to 3.x + Migrating from 1.x to 2.x + +PyTables +-------- + +.. 
toctree:: + :maxdepth: 1 + + release-notes/RELEASE_NOTES_v3.7.x + release-notes/RELEASE_NOTES_v3.6.x + release-notes/RELEASE_NOTES_v3.5.x + release-notes/RELEASE_NOTES_v3.4.x + release-notes/RELEASE_NOTES_v3.3.x + release-notes/RELEASE_NOTES_v3.2.x + release-notes/RELEASE_NOTES_v3.1.x + release-notes/RELEASE_NOTES_v3.0.x + release-notes/RELEASE_NOTES_v2.4.x + release-notes/RELEASE_NOTES_v2.3.x + release-notes/RELEASE_NOTES_v2.2.x + release-notes/RELEASE_NOTES_v2.1.x + release-notes/RELEASE_NOTES_v2.0.x + release-notes/RELEASE_NOTES_v1.4 + release-notes/RELEASE_NOTES_v1.3.3 + release-notes/RELEASE_NOTES_v1.3.2 + release-notes/RELEASE_NOTES_v1.3.1 + release-notes/RELEASE_NOTES_v1.3 + release-notes/RELEASE_NOTES_v1.2.3 + release-notes/RELEASE_NOTES_v1.2.2 + release-notes/RELEASE_NOTES_v1.2.1 + release-notes/RELEASE_NOTES_v1.2 + release-notes/RELEASE_NOTES_v1.1.1 + release-notes/RELEASE_NOTES_v1.1 + release-notes/RELEASE_NOTES_v1.0 + release-notes/RELEASE_NOTES_v0.9.1 + release-notes/RELEASE_NOTES_v0.9 + release-notes/RELEASE_NOTES_v0.8 + release-notes/RELEASE_NOTES_v0.7.2 + release-notes/RELEASE_NOTES_v0.7.1 + + +PyTables Pro +------------ + +.. toctree:: + :maxdepth: 1 + + release-notes/RELEASE_NOTES_v2.2.x-pro + release-notes/RELEASE_NOTES_v2.1.x-pro + release-notes/RELEASE_NOTES_v2.0.x-pro + + +Release timeline +---------------- + +=============== =========== ========== +PyTables 3.5.2 2019-05-31 +PyTables 3.5.1 2019-03-14 +PyTables 3.5.0 2019-03-13 +PyTables 3.4.4 2018-06-11 +PyTables 3.4.3 2018-04-17 +PyTables 3.4.2 2017-04-19 +PyTables 3.4.1 2017-04-12 +PyTables 3.4.0 2017-04-11 +PyTables 3.3.0 2016-09-12 +PyTables 3.2.3.1 2016-07-05 +PyTables 3.2.3 2016-07-04 +PyTables 3.2.2 2015-09-22 +PyTables 3.2.1.1 2015-08-31 +PyTables 3.2.1 2015-08-04 +PyTables 3.2.0 2015-05-06 +PyTables 3.2.0rc2 2015-05-01 +PyTables 3.2.0rc1 2015-04-21 +PyTables 3.1.1 2014-03-25 +PyTables 3.1.0 2014-02-05 +PyTables 3.1.0rc2 2014-01-22 +PyTables 3.1.0rc1 2014-01-17 +PyTables 3.0 2013-06-01 +PyTables 3.0rc3 2013-05-29 +PyTables 3.0rc2 2013-05-17 +PyTables 3.0rc1 2013-05-10 +PyTables 3.0b1 2013-04-27 +PyTables 2.4 2012-07-20 +PyTables 2.4rc1 2012-07-16 +PyTables 2.4b1 2012-07-07 +PyTables 2.3.1 2011-10-28 +PyTables 2.3 2011-09-23 +PyTables 2.3rc1 2011-09-11 +PyTables 2.2.1 2010-11-05 +PyTables 2.2.1rc1 2010-11-03 +Pytables 2.2 2010-07-01 +PyTables 2.2rc2 2010-06-17 +PyTables 2.2rc1 2010-05-20 +PyTables 2.1.2 2009-09-14 +PyTables 2.1.1 2009-03-13 +PyTables Pro 2.1.1 2009-03-13 +PyTables 2.1 2008-12-19 +PyTables 2.1rc2 2008-11-18 +PyTables 2.1rc1 2008-10-31 +PyTables 2.0.4 2008-07-05 +PyTables Pro 2.0.4 2008-07-05 +PyTables 2.0.3 2008-03-07 +PyTables Pro 2.0.2.1 2007-12-24 +PyTables Pro 2.0.1 2007-09-20 +PyTables 2.0.1 2007-09-20 +PyTables Pro 2.0 2007-07-12 +PyTables 2.0 2007-07-12 +PyTables 2.0rc2 2007-05-28 +PyTables 2.0rc1 2007-04-26 +PyTables 1.4 2006-12-21 +PyTables 1.3.3 2006-08-24 +PyTables 1.3.2 2006-06-20 +PyTables 1.3.1 2006-05-02 +PyTables 1.3 2006-04-01 +PyTables 1.2.3 2006-02-23 +PyTables 1.2.2 2006-02-16 +PyTables 1.2.1 2005-12-21 +PyTables 1.2 2005-11-22 +PyTables 1.1.1 2005-09-13 +PyTables 1.1 2005-07-14 +PyTables 1.0 2005-05-12 +PyTables 0.9.1 2004-12-02 +PyTables- 0.9 2004-11-08 +PyTables 0.8.1 2004-07-13 +PyTables 0.8 2004-03-03 +PyTables 0.7.2 2003-09-22 +PyTables 0.7 2003-07-31 +PyTables 0.5.1 2003-05-14 +PyTables 0.5 2003-05-10 +Pytables 0.4 2003-03-19 +=============== =========== ========== diff --git a/doc/source/usersguide/bibliography.rst b/doc/source/usersguide/bibliography.rst new file 
mode 100644 index 0000000..48bb1a0 --- /dev/null +++ b/doc/source/usersguide/bibliography.rst @@ -0,0 +1,162 @@ +Bibliography +============ + +.. _HDFG1: + +:ref:`[HDFG1] ` + The HDF Group. What is HDF5?. Concise description about HDF5 capabilities + and its differences from earlier versions (HDF4). + ``_. + +.. _HDFG2: + +:ref:`[HDFG2] ` + The HDF Group. Introduction to HDF5. Introduction to the HDF5 data model + and programming model. ``_. + +.. _HDFG3: + +:ref:`[HDFG3] ` + The HDF Group. The HDF5 table programming model. Examples on using HDF5 + tables with the C API. ``_. + +.. _MERTZ: + +:ref:`[MERTZ] ` + David Mertz. Objectify. On the 'Pythonic' treatment of XML documents as + objects(II). Article describing XML Objectify, a Python module that + allows working with XML documents as Python objects. + Some of the ideas presented here are used in PyTables. + ``_. + +.. _CYTHON: + +:ref:`[CYTHON] ` + Stefan Behnel, Robert Bradshaw, Dag Sverre Seljebotn, and Greg Ewing. + Cython. A language that makes writing C extensions for the Python + language as easy as Python itself. ``_. + +.. _NUMPY: + +:ref:`[NUMPY] ` + Travis Oliphant and et al. NumPy. Scientific Computing with Numerical + Python. The latest and most powerful re-implementation of Numeric to + date. + It implements all the features that can be found in Numeric and numarray, + plus a bunch of new others. In general, it is more efficient as well. + ``_. + +.. _NUMEXPR: + +:ref:`[NUMEXPR] ` + David Cooke, Francesc Alted, and et al. Numexpr. Fast evaluation of array + expressions by using a vector-based virtual machine. + It is an enhaced computing kernel that is generally faster (between 1x + and 10x, depending on the kind of operations) than NumPy at evaluating + complex array expressions. ``_. + +.. _ZLIB: + +:ref:`[ZLIB] ` + JeanLoup Gailly and Mark Adler. zlib. A Massively Spiffy Yet Delicately + Unobtrusive Compression Library. A standard library for compression + purposes. ``_. + +.. _LZO: + +:ref:`[LZO] ` + Markus F Oberhumer. LZO. A data compression library which is suitable for + data de-/compression in real-time. It offers pretty fast compression and + decompression with reasonable compression ratio. + ``_. + +.. _BZIP2: + +:ref:`[BZIP2] ` + Julian Seward. bzip2. A high performance lossless compressor. + It offers very high compression ratios within reasonable times. + ``_. + +.. _BLOSC: + +:ref:`[BLOSC] ` + Francesc Alted. Blosc. A blocking, shuffling and loss-less compression + library. A compressor designed to transmit data from memory to CPU + (and back) faster than a plain memcpy(). + ``_. + +.. _GNUWIN32: + +:ref:`[GNUWIN32] ` + Alexis Wilke, Jerry S., Kees Zeelenberg, and Mathias Michaelis. + GnuWin32. GNU (and other) tools ported to Win32. + GnuWin32 provides native Win32-versions of GNU tools, or tools with a + similar open source licence. + ``_. + +.. _PSYCO: + +:ref:`[PSYCO] ` + Armin Rigo. Psyco. A Python specializing compiler. + Run existing Python software faster, with no change in your source. + ``_. + +.. _SCIPY1: + +:ref:`[SCIPY1] ` + Konrad Hinsen. Scientific Python. Collection of Python modules useful for + scientific computing. + ``_. + +.. _SCIPY2: + +:ref:`[SCIPY2] ` + Eric Jones, Travis Oliphant, Pearu Peterson, and et al. SciPy. + Scientific tools for Python. SciPy supplements the popular Numeric module, + gathering a variety of high level science and engineering modules + together as a single package. + ``_. + +.. _OPTIM: + +:ref:`[OPTIM] ` + Francesc Alted and Ivan Vilata. 
Optimization of file openings in PyTables. + This document explores the savings of the opening process in terms of + both CPU time and memory, due to the adoption of a LRU cache for the + nodes in the object tree. + ``_. + +.. _OPSI: + +:ref:`[OPSI] ` + Francesc Alted and Ivan Vilata. OPSI: The indexing system of PyTables 2 + Professional Edition. Exhaustive description and benchmarks about the + indexing engine that comes with PyTables Pro. + ``_. + +.. _VITABLES: + +:ref:`[VITABLES] ` + Vicent Mas. ViTables. A GUI for PyTables/HDF5 files. + It is a graphical tool for browsing and editing files in both PyTables + and HDF5 formats. + ``_. + +.. _GIT: + +:ref:`[GIT] ` + Git is a free and open source, distributed version control system designed + to handle everything from small to very large projects with speed and + efficiency ``_. + +.. _SPHINX: + +:ref:`[SPHINX] ` + Sphinx is a tool that makes it easy to create intelligent and beautiful + documentation, written by Georg Brandl and licensed under the BSD license + ``_. + +.. |Kuepper| unicode:: K U+00FC pper .. Kuepper + +.. todo:: remove the above substitution. It is no more needed with sphinx + 1.0.8 diff --git a/doc/source/usersguide/condition_syntax.rst b/doc/source/usersguide/condition_syntax.rst new file mode 100644 index 0000000..6d5f5ff --- /dev/null +++ b/doc/source/usersguide/condition_syntax.rst @@ -0,0 +1,138 @@ +.. _condition_syntax: + +Condition Syntax +================ +.. currentmodule:: tables + +Conditions in PyTables are used in methods related with in-kernel and indexed +searches such as :meth:`Table.where` or :meth:`Table.read_where`. +They are interpreted using Numexpr, a powerful package for achieving C-speed +computation of array operations (see :ref:`[NUMEXPR] `). + +A condition on a table is just a *string* containing a Python expression +involving *at least one column*, and maybe some constants and external +variables, all combined with algebraic operators and functions. The result of +a valid condition is always a *boolean array* of the same length as the +table, where the *i*-th element is true if the value of the expression on the +*i*-th row of the table evaluates to true + +That is the reason why multidimensional fields in a table are not supported +in conditions, since the truth value of each resulting multidimensional +boolean value is not obvious. +Usually, a method using a condition will only consider the rows where the +boolean result is true. + +For instance, the condition 'sqrt(x*x + y*y) < 1' applied on a table with x +and y columns consisting of floating point numbers results in a boolean array +where the *i*-th element is true if (unsurprisingly) the value of the square +root of the sum of squares of x and y is less than 1. +The sqrt() function works element-wise, the 1 constant is adequately +broadcast to an array of ones of the length of the table for evaluation, and +the *less than* operator makes the result a valid boolean array. A condition +like 'mycolumn' alone will not usually be valid, unless mycolumn is itself a +column of scalar, boolean values. + +In the previous conditions, mycolumn, x and y are examples of *variables* +which are associated with columns. +Methods supporting conditions do usually provide their own ways of binding +variable names to columns and other values. You can read the documentation of +:meth:`Table.where` for more information on that. 
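+
+As a quick, self-contained sketch (the table and column names here are
+invented for illustration), an external variable can be bound to a condition
+through the *condvars* argument of :meth:`Table.where`::
+
+    import tables
+
+    class Point(tables.IsDescription):
+        x = tables.Float64Col()
+        y = tables.Float64Col()
+
+    with tables.open_file('points.h5', 'w') as h5file:
+        table = h5file.create_table('/', 'points', Point)
+        row = table.row
+        for i in range(1000):
+            row['x'], row['y'] = i / 1000.0, 1.0 - i / 1000.0
+            row.append()
+        table.flush()
+
+        # 'lim' is not a column; it is bound explicitly via condvars.
+        inside = [r['x'] for r in table.where('sqrt(x*x + y*y) < lim',
+                                              condvars={'lim': 1.0})]
+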
Also, please note that the +names None, True and False, besides the names of functions (see below) *can +not be overridden*, but you can always define other new names for the objects +you intend to use. + +Values in a condition may have the following types: + +- 8-bit boolean (bool). + +- 32-bit signed integer (int). + +- 64-bit signed integer (long). + +- 32-bit, single-precision floating point number (float or float32). + +- 64-bit, double-precision floating point number (double or float64). + +- 2x64-bit, double-precision complex number (complex). + +- Raw string of bytes (str). + +Nevertheless, if the type passed is not among the above ones, it will be +silently upcasted, so you don't need to worry too much about passing +supported types, except for the Unsigned 64 bits integer, that cannot be +upcasted to any of the supported types. + +However, the types in PyTables conditions are somewhat stricter than those of +Python. For instance, the *only* valid constants for booleans are True and +False, and they are *never* automatically cast to integers. The type +strengthening also affects the availability of operators and functions. +Beyond that, the usual type inference rules apply. + +Conditions support the set of operators listed below: + +- Logical operators: &, \|, ~. + +- Comparison operators: <, <=, ==, !=, >=, >. + +- Unary arithmetic operators: -. + +- Binary arithmetic operators: +, -, \*, /, \**, %. + +Types do not support all operators. Boolean values only support logical and +strict (in)equality comparison operators, while strings only support +comparisons, numbers do not work with logical operators, and complex +comparisons can only check for strict (in)equality. Unsupported operations +(including invalid castings) raise NotImplementedError exceptions. + +You may have noticed the special meaning of the usually bitwise operators &, +| and ~. Because of the way Python handles the short-circuiting of logical +operators and the truth values of their operands, conditions must use the +bitwise operator equivalents instead. +This is not difficult to remember, but you must be careful because bitwise +operators have a *higher precedence* than logical operators. For instance, +'a and b == c' (*a is true AND b is equal to c*) is *not* equivalent to +'a & b == c' (*a AND b is equal to c)*. The safest way to avoid confusions is +to *use parentheses* around logical operators, like this: 'a & (b == c)'. +Another effect of short-circuiting is that expressions like '0 < x < 1' will +*not* work as expected; you should use '(0 < x) & (x < 1)'. + +All of this may be solved if Python supported overloadable boolean operators +(see PEP 335) or some kind of non-shortcircuiting boolean operators (like C's +&&, || and !). + +You can also use the following functions in conditions: + +- where(bool, number1, number2): + number - number1 if the bool condition is true, number2 otherwise. + +- {sin,cos,tan}(float|complex): + float|complex - trigonometric sine, cosine or tangent. + +- {arcsin,arccos,arctan}(float|complex): + float|complex - trigonometric inverse sine, cosine or tangent. + +- arctan2(float1, float2): + float - trigonometric inverse tangent of float1/float2. + +- {sinh,cosh,tanh}(float|complex): + float|complex - hyperbolic sine, cosine or tangent. + +- {arcsinh,arccosh,arctanh}(float|complex): + float|complex - hyperbolic inverse sine, cosine or tangent. + +- {log,log10,log1p}(float|complex): + float|complex - natural, base-10 and log(1+x) logarithms. 
+ +- {exp,expm1}(float|complex): + float|complex - exponential and exponential minus one. + +- sqrt(float|complex): float|complex - square root. + +- abs(float|complex): float|complex - absolute value. + +- {real,imag}(complex): + float - real or imaginary part of complex. + +- complex(float, float): + complex - complex from real and imaginary parts. + diff --git a/doc/source/usersguide/datatypes.rst b/doc/source/usersguide/datatypes.rst new file mode 100644 index 0000000..b1d717d --- /dev/null +++ b/doc/source/usersguide/datatypes.rst @@ -0,0 +1,117 @@ +.. _datatypes: + +Supported data types in PyTables +================================ + +All PyTables datasets can handle the complete set of data types supported by +the NumPy (see :ref:`[NUMPY] `) package in Python. +The data types for table fields can be set via instances of the Col class and +its descendants (see :ref:`ColClassDescr`), while the data type of array +elements can be set through the use of the Atom class and its descendants +(see :ref:`AtomClassDescr`). + +PyTables uses ordinary strings to represent its *types*, with most of them +matching the names of NumPy scalar types. Usually, a PyTables type consists +of two parts: a *kind* and a *precision* in bits. +The precision may be omitted in types with just one supported precision (like +bool) or with a non-fixed size (like string). + +There are eight kinds of types supported by PyTables: + +- bool: Boolean (true/false) types. + Supported precisions: 8 (default) bits. + +- int: Signed integer types. + Supported precisions: 8, 16, 32 (default) and 64 bits. + +- uint: Unsigned integer types. + Supported precisions: 8, 16, 32 (default) and 64 bits. + +- float: Floating point types. + Supported precisions: 16, 32, 64 (default) bits and extended precision + floating point (see + :ref:`note on floating point types`). + +- complex: Complex number types. + Supported precisions: 64 (32+32), 128 (64+64, default) bits and extended + precision complex (see + :ref:`note on floating point types`). + +- string: Raw string types. + Supported precisions: 8-bit positive multiples. + +- time: Data/time types. + Supported precisions: 32 and 64 (default) bits. + +- enum: Enumerated types. + Precision depends on base type. + +.. _floating-point-note: +.. note:: Floating point types. + + The half precision floating point data type (float16) and extended + precision ones (fload96, float128, complex192, complex256) are only + available if numpy_ supports them on the host platform. + + Also, in order to use the half precision floating point type (float16) + it is required numpy_ >= 1.6.0. + +.. _numpy: http://www.numpy.org + +The time and enum kinds area little bit special, since they represent HDF5 +types which have no direct Python counterpart, though atoms of these kinds +have a more-or-less equivalent NumPy data type. + +There are two types of time: 4-byte signed integer (time32) and 8-byte double +precision floating point (time64). Both of them reflect the number of seconds +since the Unix epoch, i.e. Jan 1 00:00:00 UTC 1970. They are stored in memory +as NumPy's int32 and float64, respectively, and in the HDF5 file using the +H5T_TIME class. Integer times are stored on disk as such, while floating +point times are split into two signed integer values representing seconds and +microseconds (beware: smaller decimals will be lost!). + +PyTables also supports HDF5 H5T_ENUM *enumerations* (restricted sets of +unique name and unique value pairs). 
The NumPy representation of an +enumerated value (an Enum, see :ref:`EnumClassDescr`) depends on the concrete +*base type* used to store the enumeration in the HDF5 file. +Currently, only scalar integer values (both signed and unsigned) are +supported in enumerations. This restriction may be lifted when HDF5 supports +other kinds on enumerated values. + +Here you have a quick reference to the complete set of supported data types: + +.. table:: **Data types supported for array elements and tables columns in + PyTables.** + + ================== ========================== ====================== =============== ================== + Type Code Description C Type Size (in bytes) Python Counterpart + ================== ========================== ====================== =============== ================== + bool boolean unsigned char 1 bool + int8 8-bit integer signed char 1 int + uint8 8-bit unsigned integer unsigned char 1 int + int16 16-bit integer short 2 int + uint16 16-bit unsigned integer unsigned short 2 int + int32 integer int 4 int + uint32 unsigned integer unsigned int 4 long + int64 64-bit integer long long 8 long + uint64 unsigned 64-bit integer unsigned long long 8 long + float16 [1]_ half-precision float - 2 - + float32 single-precision float float 4 float + float64 double-precision float double 8 float + float96 [1]_ [2]_ extended precision float - 12 - + float128 [1]_ [2]_ extended precision float - 16 - + complex64 single-precision complex struct {float r, i;} 8 complex + complex128 double-precision complex struct {double r, i;} 16 complex + complex192 [1]_ extended precision complex - 24 - + complex256 [1]_ extended precision complex - 32 - + string arbitrary length string char[] * str + time32 integer time POSIX's time_t 4 int + time64 floating point time POSIX's struct timeval 8 float + enum enumerated value enum - - + ================== ========================== ====================== =============== ================== + +.. rubric:: Footnotes + +.. [1] see the above :ref:`note on floating point types `. +.. [2] currently in numpy_. "float96" and "float128" are equivalent of + "longdouble" i.e. 80 bit extended precision floating point. diff --git a/doc/source/usersguide/file_format.rst b/doc/source/usersguide/file_format.rst new file mode 100644 index 0000000..170e0e8 --- /dev/null +++ b/doc/source/usersguide/file_format.rst @@ -0,0 +1,351 @@ +PyTables File Format +==================== +PyTables has a powerful capability to deal with native HDF5 files created +with another tools. However, there are situations were you may want to create +truly native PyTables files with those tools while retaining fully +compatibility with PyTables format. That is perfectly possible, and in this +appendix is presented the format that you should endow to your own-generated +files in order to get a fully PyTables compatible file. + +We are going to describe the *2.0 version of PyTables file format* +(introduced in PyTables version 2.0). As time goes by, some changes might be +introduced (and documented here) in order to cope with new necessities. +However, the changes will be carefully pondered so as to ensure backward +compatibility whenever is possible. + +A PyTables file is composed with arbitrarily large amounts of HDF5 groups +(Groups in PyTables naming scheme) and datasets (Leaves in PyTables naming +scheme). For groups, the only requirements are that they must have some +*system attributes* available. 
By convention, system attributes in PyTables +are written in upper case, and user attributes in lower case but this is not +enforced by the software. In the case of datasets, besides the mandatory +system attributes, some conditions are further needed in their storage +layout, as well as in the datatypes used in there, as we will see shortly. + +As a final remark, you can use any filter as you want to create a PyTables +file, provided that the filter is a standard one in HDF5, like *zlib*, +*shuffle* or *szip* (although the last one can not be used from within +PyTables to create a new file, datasets compressed with szip can be read, +because it is the HDF5 library which do the decompression transparently). + + +.. currentmodule:: tables + +Mandatory attributes for a File +------------------------------- +The File object is, in fact, an special HDF5 *group* structure that is *root* +for the rest of the objects on the object tree. The next attributes are +mandatory for the HDF5 *root group* structure in PyTables files: + +* *CLASS*: This attribute should always be set to 'GROUP' for group + structures. +* *PYTABLES_FORMAT_VERSION*: It represents the internal format version, and + currently should be set to the '2.0' string. +* *TITLE*: A string where the user can put some description on what is this + group used for. +* *VERSION*: Should contains the string '1.0'. + + +Mandatory attributes for a Group +-------------------------------- +The next attributes are mandatory for *group* structures: + +* *CLASS*: This attribute should always be set to 'GROUP' for group structures. +* *TITLE*: A string where the user can put some description on what is this + group used for. +* *VERSION*: Should contains the string '1.0'. + + +Optional attributes for a Group +------------------------------- +The next attributes are optional for *group* structures: + +* *FILTERS*: When present, this attribute contains the filter properties (a + Filters instance, see section :ref:`FiltersClassDescr`) that may be + inherited by leaves or groups created immediately under this group. This is + a packed 64-bit integer structure, where + + - *byte 0* (the least-significant byte) is the compression level + (complevel). + - *byte 1* is the compression library used (complib): 0 when irrelevant, 1 + for Zlib, 2 for LZO and 3 for Bzip2. + - *byte 2* indicates which parameterless filters are enabled (shuffle and + fletcher32): bit 0 is for *Shuffle* while bit 1 is for*Fletcher32*. + - other bytes are reserved for future use. + + +Mandatory attributes, storage layout and supported data types for Leaves +------------------------------------------------------------------------ +This depends on the kind of Leaf. The format for each type follows. + + +.. _TableFormatDescr: + +Table format +~~~~~~~~~~~~ + +Mandatory attributes +^^^^^^^^^^^^^^^^^^^^ +The next attributes are mandatory for *table* structures: + +* *CLASS*: Must be set to 'TABLE'. +* *TITLE*: A string where the user can put some description on what is this + dataset used for. +* *VERSION*: Should contain the string '2.6'. +* *FIELD_X_NAME*: It contains the names of the different fields. The X means + the number of the field, zero-based (beware, order do matter). You should + add as many attributes of this kind as fields you have in your records. +* *FIELD_X_FILL*: It contains the default values of the different fields. All + the datatypes are supported natively, except for complex types that are + currently serialized using Pickle. 
The X means the number of the field, + zero-based (beware, order do matter). You should add as many attributes of + this kind as fields you have in your records. These fields are meant for + saving the default values persistently and their existence is optional. +* *NROWS*: This should contain the number of *compound* data type entries in + the dataset. It must be an *int* data type. + + +Storage Layout +^^^^^^^^^^^^^^ +A Table has a *dataspace* with a *1-dimensional chunked* layout. + +Datatypes supported +^^^^^^^^^^^^^^^^^^^ +The datatype of the elements (rows) of Table must be the H5T_COMPOUND +*compound* data type, and each of these compound components must be built +with only the next HDF5 data types *classes*: + +* *H5T_BITFIELD*: This class is used to represent the Bool type. Such a type + must be build using a H5T_NATIVE_B8 datatype, followed by a HDF5 + H5Tset_precision call to set its precision to be just 1 bit. +* *H5T_INTEGER*: This includes the next data types: + * *H5T_NATIVE_SCHAR*: This represents a *signed char* C type, but it is + effectively used to represent an Int8 type. + * *H5T_NATIVE_UCHAR*: This represents an *unsigned char* C type, but it + is effectively used to represent an UInt8 type. + * *H5T_NATIVE_SHORT*: This represents a *short* C type, and it is + effectively used to represent an Int16 type. + * *H5T_NATIVE_USHORT*: This represents an *unsigned short* C type, and it + is effectively used to represent an UInt16 type. + * *H5T_NATIVE_INT*: This represents an *int* C type, and it is + effectively used to represent an Int32 type. + * *H5T_NATIVE_UINT*: This represents an *unsigned int* C type, and it is + effectively used to represent an UInt32 type. + * *H5T_NATIVE_LONG*: This represents a *long* C type, and it is + effectively used to represent an Int32 or an Int64, depending on + whether you are running a 32-bit or 64-bit architecture. + * *H5T_NATIVE_ULONG*: This represents an *unsigned long* C type, and it + is effectively used to represent an UInt32 or an UInt64, depending on + whether you are running a 32-bit or 64-bit architecture. + * *H5T_NATIVE_LLONG*: This represents a *long long* C type (__int64, if + you are using a Windows system) and it is effectively used to represent + an Int64 type. + * *H5T_NATIVE_ULLONG*: This represents an *unsigned long long* C type + (beware: this type does not have a correspondence on Windows systems) + and it is effectively used to represent an UInt64 type. +* *H5T_FLOAT*: This includes the next datatypes: + * *H5T_NATIVE_FLOAT*: This represents a *float* C type and it is + effectively used to represent an Float32 type. + * *H5T_NATIVE_DOUBLE*: This represents a *double* C type and it is + effectively used to represent an Float64 type. +* *H5T_TIME*: This includes the next datatypes: + * *H5T_UNIX_D32*: This represents a POSIX *time_t* C type and it is + effectively used to represent a 'Time32' aliasing type, which + corresponds to an Int32 type. + * *H5T_UNIX_D64*: This represents a POSIX *struct timeval* C type and it + is effectively used to represent a 'Time64' aliasing type, which + corresponds to a Float64 type. +* *H5T_STRING*: The datatype used to describe strings in PyTables is H5T_C_S1 + (i.e. a *string* C type) followed with a call to the HDF5 H5Tset_size() + function to set their length. +* *H5T_ARRAY*: This allows the construction of homogeneous, multidimensional + arrays, so that you can include such objects in compound records. 
The types + supported as elements of H5T_ARRAY data types are the ones described above. + Currently, PyTables does not support nested H5T_ARRAY types. +* *H5T_COMPOUND*: This allows the support for datatypes that are compounds of + compounds (this is also known as *nested types* along this manual). + + This support can also be used for defining complex numbers. Its format is + described below: + + The H5T_COMPOUND type class contains two members. Both members must have + the H5T_FLOAT atomic datatype class. The name of the first member should be + "r" and represents the real part. The name of the second member should be + "i" and represents the imaginary part. The *precision* property of both of + the H5T_FLOAT members must be either 32 significant bits (e.g. + H5T_NATIVE_FLOAT) or 64 significant bits (e.g. H5T_NATIVE_DOUBLE). They + represent Complex32 and Complex64 types respectively. + + +Array format +~~~~~~~~~~~~ + +Mandatory attributes +^^^^^^^^^^^^^^^^^^^^ +The next attributes are mandatory for *array* structures: + +* *CLASS*: Must be set to 'ARRAY'. +* *TITLE*: A string where the user can put some description on what is this + dataset used for. +* *VERSION*: Should contain the string '2.3'. + + +Storage Layout +^^^^^^^^^^^^^^ +An Array has a *dataspace* with a *N-dimensional contiguous* layout (if you +prefer a *chunked* layout see EArray below). + + +Datatypes supported +^^^^^^^^^^^^^^^^^^^ +The elements of Array must have either HDF5 *atomic* data types or a +*compound* data type representing a complex number. The atomic data types can +currently be one of the next HDF5 data type *classes*: H5T_BITFIELD, +H5T_INTEGER, H5T_FLOAT and H5T_STRING. The H5T_TIME class is also supported +for reading existing Array objects, but not for creating them. See the Table +format description in :ref:`TableFormatDescr` for more info about these +types. + +In addition to the HDF5 atomic data types, the Array format supports complex +numbers with the H5T_COMPOUND data type class. +See the Table format description in :ref:`TableFormatDescr` for more info +about this special type. + +You should note that H5T_ARRAY class datatypes are not allowed in Array +objects. + + +CArray format +~~~~~~~~~~~~~ + +Mandatory attributes +^^^^^^^^^^^^^^^^^^^^ +The next attributes are mandatory for *CArray* structures: + +* *CLASS*: Must be set to 'CARRAY'. +* *TITLE*: A string where the user can put some description on what is this + dataset used for. +* *VERSION*: Should contain the string '1.0'. + + +Storage Layout +^^^^^^^^^^^^^^ +An CArray has a *dataspace* with a *N-dimensional chunked* layout. + +Datatypes supported +^^^^^^^^^^^^^^^^^^^ +The elements of CArray must have either HDF5 *atomic* data types or a +*compound* data type representing a complex number. The atomic data types can +currently be one of the next HDF5 data type *classes*: H5T_BITFIELD, +H5T_INTEGER, H5T_FLOAT and H5T_STRING. The H5T_TIME class is also supported +for reading existing CArray objects, but not for creating them. See the Table +format description in :ref:`TableFormatDescr` for more info about these +types. + +In addition to the HDF5 atomic data types, the CArray format supports complex +numbers with the H5T_COMPOUND data type class. +See the Table format description in :ref:`TableFormatDescr` for more info +about this special type. + +You should note that H5T_ARRAY class datatypes are not allowed yet in Array +objects. 
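+
+As an informal sketch of the above (h5py is used here only as an example of a
+non-PyTables HDF5 writer; any tool able to create a chunked dataset and set
+these attributes works), a minimal CArray-compatible dataset could be written
+like this::
+
+    import numpy as np
+    import h5py
+
+    with h5py.File('carray_compat.h5', 'w') as f:
+        # Mandatory attributes for the root group (see "Mandatory
+        # attributes for a File" above).
+        f.attrs['CLASS'] = np.bytes_('GROUP')
+        f.attrs['PYTABLES_FORMAT_VERSION'] = np.bytes_('2.0')
+        f.attrs['TITLE'] = np.bytes_('')
+        f.attrs['VERSION'] = np.bytes_('1.0')
+
+        # N-dimensional chunked dataset playing the role of a CArray.
+        dset = f.create_dataset('carray', shape=(1000, 1000),
+                                dtype='float64', chunks=(100, 100))
+        dset.attrs['CLASS'] = np.bytes_('CARRAY')
+        dset.attrs['TITLE'] = np.bytes_('written outside PyTables')
+        dset.attrs['VERSION'] = np.bytes_('1.0')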
+ + +EArray format +~~~~~~~~~~~~~ + +Mandatory attributes +^^^^^^^^^^^^^^^^^^^^ +The next attributes are mandatory for *earray* structures: + +* *CLASS*: Must be set to 'EARRAY'. +* *EXTDIM*: (*Integer*) Must be set to the extendable dimension. Only one + extendable dimension is supported right now. +* *TITLE*: A string where the user can put some description on what is this + dataset used for. +* *VERSION*: Should contain the string '1.3'. + + +Storage Layout +^^^^^^^^^^^^^^ +An EArray has a *dataspace* with a *N-dimensional chunked* layout. + + +Datatypes supported +^^^^^^^^^^^^^^^^^^^ +The elements of EArray are allowed to have the same data types as for the +elements in the Array format. They can be one of the HDF5 *atomic* data type +*classes*: H5T_BITFIELD, H5T_INTEGER, H5T_FLOAT, H5T_TIME or H5T_STRING, see +the Table format description in :ref:`TableFormatDescr` for more info about +these types. They can also be a H5T_COMPOUND datatype representing a complex +number, see the Table format description in :ref:`TableFormatDescr`. + +You should note that H5T_ARRAY class data types are not allowed in EArray +objects. + + +.. _VLArrayFormatDescr: + +VLArray format +~~~~~~~~~~~~~~ + +Mandatory attributes +^^^^^^^^^^^^^^^^^^^^ +The next attributes are mandatory for *vlarray* structures: + +* *CLASS*: Must be set to 'VLARRAY'. +* *PSEUDOATOM*: This is used so as to specify the kind of pseudo-atom (see + :ref:`VLArrayFormatDescr`) for the VLArray. It can take the values + 'vlstring', 'vlunicode' or 'object'. If your atom is not a pseudo-atom then + you should not specify it. +* *TITLE*: A string where the user can put some description on what is this + dataset used for. +* *VERSION*: Should contain the string '1.3'. + + +Storage Layout +^^^^^^^^^^^^^^ +An VLArray has a *dataspace* with a *1-dimensional chunked* layout. + + +Data types supported +^^^^^^^^^^^^^^^^^^^^ +The data type of the elements (rows) of VLArray objects must be the H5T_VLEN +*variable-length* (or VL for short) datatype, and the base datatype specified +for the VL datatype can be of any *atomic* HDF5 datatype that is listed in +the Table format description :ref:`TableFormatDescr`. That includes the +classes: + +- H5T_BITFIELD +- H5T_INTEGER +- H5T_FLOAT +- H5T_TIME +- H5T_STRING +- H5T_ARRAY + +They can also be a H5T_COMPOUND data type representing a complex number, see +the Table format description in :ref:`TableFormatDescr` for a detailed +description. + +You should note that this does not include another VL datatype, or a compound +datatype that does not fit the description of a complex number. Note as well +that, for object and vlstring pseudo-atoms, the base for the VL datatype is +always a H5T_NATIVE_UCHAR (H5T_NATIVE_UINT for vlunicode). That means that +the complete row entry in the dataset has to be used in order to fully +serialize the object or the variable length string. + + +Optional attributes for Leaves +------------------------------ +The next attributes are optional for *leaves*: + +* *FLAVOR*: This is meant to provide the information about the kind of object + kept in the Leaf, i.e. when the dataset is read, it will be converted to + the indicated flavor. + It can take one the next string values: + + * *"numpy"*: Read data (structures arrays, arrays, records, scalars) will + be returned as NumPy objects. + * *"python"*: Read data will be returned as Python lists, tuples, or + scalars. 
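+
+For instance, a minimal sketch of how the FLAVOR attribute is driven from
+PyTables itself (the file and node names are placeholders)::
+
+    import tables
+
+    with tables.open_file('flavor_demo.h5', 'w') as h5file:
+        arr = h5file.create_array('/', 'values', [1, 2, 3], title='demo')
+        print(type(arr.read()))   # NumPy array: the default "numpy" flavor
+        arr.flavor = 'python'     # recorded in the FLAVOR system attribute
+        print(type(arr.read()))   # now a plain Python list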
+ diff --git a/doc/source/usersguide/filenode.rst b/doc/source/usersguide/filenode.rst new file mode 100644 index 0000000..fc3049f --- /dev/null +++ b/doc/source/usersguide/filenode.rst @@ -0,0 +1,292 @@ +.. _filenode_usersguide: + +filenode - simulating a filesystem with PyTables +================================================ + +.. currentmodule:: tables.nodes.filenode + +What is filenode? +----------------- +filenode is a module which enables you to create a PyTables database of nodes +which can be used like regular opened files in Python. In other words, you +can store a file in a PyTables database, and read and write it as you would +do with any other file in Python. Used in conjunction with PyTables +hierarchical database organization, you can have your database turned into an +open, extensible, efficient, high capacity, portable and metadata-rich +filesystem for data exchange with other systems (including backup purposes). + +Between the main features of filenode, one can list: + +- *Open:* Since it relies on PyTables, which in turn, sits over HDF5 (see + :ref:`[HDGG1] `), a standard hierarchical data format from NCSA. + +- *Extensible:* You can define new types of nodes, and their instances will + be safely preserved (as are normal groups, leafs and attributes) by + PyTables applications having no knowledge of their types. Moreover, the set + of possible attributes for a node is not fixed, so you can define your own + node attributes. + +- *Efficient:* Thanks to PyTables' proven extreme efficiency on handling huge + amounts of data. filenode can make use of PyTables' on-the-fly compression + and decompression of data. + +- *High capacity:* Since PyTables and HDF5 are designed for massive data + storage (they use 64-bit addressing even where the platform does not + support it natively). + +- *Portable:* Since the HDF5 format has an architecture-neutral design, and + the HDF5 libraries and PyTables are known to run under a variety of + platforms. Besides that, a PyTables database fits into a single file, which + poses no trouble for transportation. + +- *Metadata-rich:* Since PyTables can store arbitrary key-value pairs (even + Python objects!) for every database node. Metadata may include authorship, + keywords, MIME types and encodings, ownership information, access control + lists (ACL), decoding functions and anything you can imagine! + + +Finding a filenode node +----------------------- +filenode nodes can be recognized because they have a NODE_TYPE system +attribute with a 'file' value. It is recommended that you use the +:meth:`File.get_node_attr` method of tables.File class to get the NODE_TYPE +attribute independently of the nature (group or leaf) of the node, so you do +not need to care about. + + +filenode - simulating files inside PyTables +------------------------------------------- +The filenode module is part of the nodes sub-package of PyTables. The +recommended way to import the module is:: + + >>> from tables.nodes import filenode + +However, filenode exports very few symbols, so you can import * for +interactive usage. In fact, you will most probably only use the NodeType +constant and the new_node() and open_node() calls. + +The NodeType constant contains the value that the NODE_TYPE system attribute +of a node file is expected to contain ('file', as we have seen). +Although this is not expected to change, you should use filenode.NodeType +instead of the literal 'file' when possible. 
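+
+A minimal sketch (the file name is just an example) of locating file nodes by
+checking the NODE_TYPE attribute in the recommended way::
+
+    import tables
+    from tables.nodes import filenode
+
+    with tables.open_file('fnode.h5', 'r') as h5file:
+        for node in h5file.walk_nodes('/'):
+            try:
+                node_type = h5file.get_node_attr(node, 'NODE_TYPE')
+            except AttributeError:
+                continue                        # no NODE_TYPE attribute here
+            if node_type == filenode.NodeType:  # i.e. the string 'file'
+                print(node._v_pathname, 'is a file node')
+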
+
+new_node() and open_node() are the equivalents of the Python open() call for
+ordinary files. Their arguments differ from those of open(), but this is the
+only point where you will notice a difference between working with a node
+file and working with an ordinary file.
+
+For this little tutorial, we will assume that we have a PyTables database
+opened for writing. Also, if you do not feel like typing the code yourself,
+it is included in the examples/filenodes1.py file.
+
+You can create a brand new file with these statements::
+
+    >>> import tables
+    >>> h5file = tables.open_file('fnode.h5', 'w')
+
+
+Creating a new file node
+~~~~~~~~~~~~~~~~~~~~~~~~
+Creation of a new file node is achieved with the new_node() call. You must
+tell it the PyTables file in which you want to create the node, where in the
+PyTables hierarchy to create it, and its name. The PyTables file is the
+first argument to new_node(); it will also be called the 'host PyTables
+file'. The other two arguments must be given as the keyword arguments where
+and name, respectively.
+As a result of the call, a brand new appendable and readable file node
+object is returned.
+
+So let us create a new node file in the previously opened h5file PyTables
+file, named 'fnode_test' and placed right under the root of the database
+hierarchy. Here is the command::
+
+    >>> fnode = filenode.new_node(h5file, where='/', name='fnode_test')
+
+That is basically all you need to create a file node. Simple, isn't it? From
+that point on, you can use fnode as any opened Python file (i.e. you can
+write data, read data, read lines of text and so on).
+
+new_node() accepts some more keyword arguments. You can give a title to your
+file with the title argument. You can use PyTables' compression features
+with the filters argument. If you know beforehand the size that your file
+will have, you can give its final file size in bytes in the expectedsize
+argument so that the PyTables library can optimize data access.
+
+new_node() creates a PyTables node where it is told to. To prove it, we will
+try to get the NODE_TYPE attribute from the newly created node::
+
+    >>> print(h5file.get_node_attr('/fnode_test', 'NODE_TYPE'))
+    file
+
+
+Using a file node
+~~~~~~~~~~~~~~~~~
+As stated above, you can use the new node file as any other opened file. Let
+us try to write some text into it and read it back::
+
+    >>> print("This is a test text line.", file=fnode)
+    >>> print("And this is another one.", file=fnode)
+    >>> print(file=fnode)
+    >>> fnode.write("Of course, file methods can also be used.")
+    >>>
+    >>> fnode.seek(0)  # Go back to the beginning of the file.
+    >>>
+    >>> for line in fnode:
+    ...     print(repr(line))
+    'This is a test text line.\\n'
+    'And this is another one.\\n'
+    '\\n'
+    'Of course, file methods can also be used.'
+
+This was run on a Unix system, so newlines are expressed as '\n'. In fact,
+you can override the line separator for a file by setting its line_separator
+property to any string you want.
+
+While using a file node, you should take care to close it *before* you close
+the PyTables host file.
+Because of the way PyTables works, your data will not be at risk, but every
+operation you execute after closing the host file will fail with a
+ValueError.
To close a file node, simply delete it or call its close()
+method::
+
+    >>> fnode.close()
+    >>> print(fnode.closed)
+    True
+
+
+Opening an existing file node
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+If you have a file node that you created using new_node(), you can open it
+later by calling open_node(). Its arguments are similar to those of open():
+the first argument is the PyTables node that you want to open (i.e. a node
+with a NODE_TYPE attribute having a 'file' value), and the second argument
+is a mode string indicating how to open the file. Contrary to open(),
+open_node() cannot be used to create a new file node.
+
+File nodes can be opened in read-only mode ('r') or in read-and-append mode
+('a+'). Reading from a file node is allowed in both modes, but appending is
+only allowed in the second one. Just like Python files do, writing data to
+an appendable file places it after the file pointer if it is on or beyond
+the end of the file, or otherwise after the existing data. Let us see an
+example::
+
+    >>> node = h5file.root.fnode_test
+    >>> fnode = filenode.open_node(node, 'a+')
+    >>> print(repr(fnode.readline()))
+    'This is a test text line.\\n'
+    >>> print(fnode.tell())
+    26
+    >>> print("This is a new line.", file=fnode)
+    >>> print(repr(fnode.readline()))
+    ''
+
+Of course, the data append process places the pointer at the end of the
+file, so the last readline() call hit EOF. Let us seek to the beginning of
+the file to see its whole contents::
+
+    >>> fnode.seek(0)
+    >>> for line in fnode:
+    ...     print(repr(line))
+    'This is a test text line.\\n'
+    'And this is another one.\\n'
+    '\\n'
+    'Of course, file methods can also be used.This is a new line.\\n'
+
+As you can check, the last string we wrote was correctly appended at the end
+of the file, instead of overwriting the second line, where the file pointer
+was positioned at the time of appending.
+
+
+Adding metadata to a file node
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+You can associate arbitrary metadata with any open node file, regardless of
+its mode, as long as the host PyTables file is writable. Of course, you
+could use the set_node_attr() method of tables.File to do it directly on the
+proper node, but filenode offers a much more comfortable way to do it.
+filenode objects have an attrs property which gives you direct access to
+their corresponding AttributeSet object.
+
+For instance, let us see how to associate MIME type metadata with our file
+node::
+
+    >>> fnode.attrs.content_type = 'text/plain; charset=us-ascii'
+
+As simple as A-B-C. You can put nearly anything in an attribute, which opens
+the way to authorship, keywords, permissions and more. Moreover, the list of
+attributes is not fixed.
+However, you should avoid names in all caps or starting with '_', since
+PyTables and filenode may use them internally. Some valid examples::
+
+    >>> fnode.attrs.author = "Ivan Vilata i Balaguer"
+    >>> fnode.attrs.creation_date = '2004-10-20T13:25:25+0200'
+    >>> fnode.attrs.keywords_en = ["FileNode", "test", "metadata"]
+    >>> fnode.attrs.keywords_ca = ["FileNode", "prova", "metadades"]
+    >>> fnode.attrs.owner = 'ivan'
+    >>> fnode.attrs.acl = {'ivan': 'rw', '@users': 'r'}
+
+You can check that these attributes get stored by running the ptdump command
+on the host PyTables file.
+
+.. code-block:: bash
+
+    $ ptdump -a fnode.h5:/fnode_test
+    /fnode_test (EArray(113,)) ''
+    /fnode_test.attrs (AttributeSet), 14 attributes:
+     [CLASS := 'EARRAY',
+      EXTDIM := 0,
+      FLAVOR := 'numpy',
+      NODE_TYPE := 'file',
+      NODE_TYPE_VERSION := 2,
+      TITLE := '',
+      VERSION := '1.2',
+      acl := {'ivan': 'rw', '@users': 'r'},
+      author := 'Ivan Vilata i Balaguer',
+      content_type := 'text/plain; charset=us-ascii',
+      creation_date := '2004-10-20T13:25:25+0200',
+      keywords_ca := ['FileNode', 'prova', 'metadades'],
+      keywords_en := ['FileNode', 'test', 'metadata'],
+      owner := 'ivan']
+
+Note that filenode makes no assumptions about the meaning of your metadata,
+so its handling is entirely left to your needs and imagination.
+
+
+Complementary notes
+-------------------
+You can use file nodes and PyTables groups to mimic a filesystem with files
+and directories. Since you can store nearly anything you want as file
+metadata, this enables you to use a PyTables file as a portable compressed
+backup, even between radically different platforms. Take this with a grain
+of salt, since node files are restricted in their naming (only names that
+are valid Python identifiers are allowed); however, remember that you can
+use node titles and metadata to overcome this limitation. Also, you may need
+to devise some strategy to represent special files such as devices, sockets
+and such (not necessarily using filenode).
+
+We are eager to hear your opinion about filenode and its potential uses.
+Suggestions to improve filenode and create other node types are also
+welcome. Do not hesitate to contact us!
+
+
+Current limitations
+-------------------
+filenode is still a young piece of software, so it lacks some functionality.
+This is a list of known current limitations:
+
+#. Node files can only be opened in read-only or read-and-append mode. This
+   should be enhanced in the near future.
+#. Only binary I/O is currently supported (reading and writing strings of
+   bytes).
+#. There is no universal newline support yet. The only newline character
+   used at the moment is ``\n``. This is likely to be improved in the near
+   future.
+#. Sparse files (files with lots of zeros) are not treated specially; if you
+   want them to take less space, you will be better off using compression.
+
+Even with these limitations, filenode remains entirely adequate for working
+with most binary and text files. Of course, suggestions and patches are
+welcome.
+
+See :ref:`filenode_classes` for detailed documentation on the filenode
+interface.
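+
+As a final, self-contained sketch (it merely replays the steps of this
+tutorial and assumes the fnode.h5 file and /fnode_test node created above),
+a file node can later be reopened in read-only mode and its raw contents
+retrieved in one go::
+
+    >>> import tables
+    >>> from tables.nodes import filenode
+    >>> h5file = tables.open_file('fnode.h5', 'r')
+    >>> fnode = filenode.open_node(h5file.root.fnode_test, 'r')
+    >>> data = fnode.read()    # the whole contents, as bytes
+    >>> fnode.close()
+    >>> h5file.close()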
+
[Image diffs condensed: this commit also adds new figure files under
doc/source/usersguide/images/, mostly as PNG/SVG pairs -- the query benchmark
plots Q7-10m-noidx, Q8-1g-idx-SSD, Q8-1g-idx-compress, Q8-1g-idx-optlevels,
Q8-1g-idx-sorted and Q8-1g-noidx; the compressed-recordsize,
compressed-select-cache, compressed-select-nocache and compressed-writing
series (plain, zlib and Shuffle variants); create-chunksize-15GB,
create-index-time-int32-float64, filesizes-chunksize-15GB, indexes-sizes2,
the objecttree diagram (.dia, .pdf, .png, .svg), pytables-front-logo,
query-time-nhits-cold-cache-float64, query-time-repeated-query-float64 and
random-chunksize-15GB. The SVG payloads are not reproduced here; the
recoverable text consists of plot titles such as "Disk space taken by a
record (original record size: 16 bytes)", "Selecting with small (16 bytes)
record size (file in cache / file not in cache)", "Writing with small
(16 bytes) record size", "Query time for an indexed table with 1 gigarow
(cold cache)", "Query time for Float64 column (repeated query)" and "Sizes
for index of a 1 Grow column with different optimizations (PyTables Pro 2.1
vs PostgreSQL 8.3.1)"; axis labels ("Number of rows" vs "Bytes/row",
"MRows/s" or "Time (s)"); legend entries for zlib, lzo and bzip2 at several
compression levels, with and without Shuffle, and for PyTables Pro vs
Postgres; plus annotations such as "Automatic chunksize", "25x faster",
"15x lighter" and the logo tagline "Hierarchical datasets in Python".]
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Automaticchunksize + + diff --git a/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.png b/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.png new file mode 100644 index 0000000..a84a756 Binary files /dev/null and b/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.png differ diff --git a/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.svg b/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.svg new file mode 100644 index 0000000..dfa2c13 --- /dev/null +++ b/doc/source/usersguide/images/read-medium-psyco-nopsyco-comparison.svg @@ -0,0 +1,1925 @@ + + + + + SVG drawing + This was produced by version 4.1 of GNU libplot, a free library for exporting 2-D vector graphics. 
+ + + + + 0 + + + + 200 + + + + 400 + + + + 600 + + + + 800 + + + + 1000 + + + + 1200 + + + + 1000 + + + + + + + + + + 10000 + + + + + + + + + + 100000 + + + + + + + + + + 1e+06 + + + + + + + + + + 1e+07 + + +Speed (Krow/s) + + +Number of rows + +Selecting with medium record size (56 bytes) + +No Psyco + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Psyco + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/usersguide/images/seq-chunksize-15GB.png b/doc/source/usersguide/images/seq-chunksize-15GB.png new file mode 100644 index 0000000..dd017fa Binary files /dev/null and b/doc/source/usersguide/images/seq-chunksize-15GB.png differ diff --git a/doc/source/usersguide/images/seq-chunksize-15GB.svg b/doc/source/usersguide/images/seq-chunksize-15GB.svg new file mode 100644 index 0000000..beefcbc --- /dev/null +++ b/doc/source/usersguide/images/seq-chunksize-15GB.svg @@ -0,0 +1,2815 @@ + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Automaticchunksize + + diff --git a/doc/source/usersguide/images/tutorial1-1-tableview.png b/doc/source/usersguide/images/tutorial1-1-tableview.png new file mode 100644 index 
0000000..abd532e Binary files /dev/null and b/doc/source/usersguide/images/tutorial1-1-tableview.png differ
diff --git a/doc/source/usersguide/images/tutorial1-2-tableview.png b/doc/source/usersguide/images/tutorial1-2-tableview.png new file mode 100644 index 0000000..e792902 Binary files /dev/null and b/doc/source/usersguide/images/tutorial1-2-tableview.png differ
diff --git a/doc/source/usersguide/images/tutorial1-general.png b/doc/source/usersguide/images/tutorial1-general.png new file mode 100644 index 0000000..45d6687 Binary files /dev/null and b/doc/source/usersguide/images/tutorial1-general.png differ
diff --git a/doc/source/usersguide/images/tutorial2-tableview.png b/doc/source/usersguide/images/tutorial2-tableview.png new file mode 100644 index 0000000..d0390b3 Binary files /dev/null and b/doc/source/usersguide/images/tutorial2-tableview.png differ
diff --git a/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.png b/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.png new file mode 100644 index 0000000..0a1d795 Binary files /dev/null and b/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.png differ
diff --git a/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.svg b/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.svg new file mode 100644 index 0000000..8da79ad --- /dev/null +++ b/doc/source/usersguide/images/write-medium-psyco-nopsyco-comparison.svg @@ -0,0 +1,1906 @@
+ [SVG plot markup omitted (produced by GNU libplot 4.1); title: "Writing with medium record size (56 bytes)"; x axis: "Number of rows" (1000 to 1e+07); y axis: "Speed (Krow/s)" (0 to 250); series: "No Psyco", "Psyco"]
diff --git a/doc/source/usersguide/index.rst b/doc/source/usersguide/index.rst new file mode 100644 index 0000000..3f808dd --- /dev/null +++ b/doc/source/usersguide/index.rst @@ -0,0 +1,68 @@ +===================== +PyTables User's Guide +===================== +------------------------------- +Hierarchical datasets in Python +------------------------------- + +:Authors: **Francesc Alted, Ivan Vilata, Scott Prater, Vicent Mas, Tom Hedley, + Antonio Valentino, Jeffrey Whitaker, Anthony Scopatz, Josh Moore** +:Copyright: |copy| 2002, 2003, 2004 - Francesc Alted + + |copy| 2005, 2006, 2007 - Cárabos Coop. V. + + |copy| 2008, 2009, 2010 - Francesc Alted + + |copy| 2011–2021 - PyTables maintainers + +-------- +Contents +-------- +..
toctree:: + :maxdepth: 1 + + introduction + installation + tutorials + libref + optimization + filenode + datatypes + condition_syntax + parameter_files + utilities + file_format + bibliography + +-------------------------------------------------------- +Copyright Notice and Statement for PyTables User's Guide +-------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +a. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +b. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +c. Neither the name of Francesc Alted nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, +BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +.. |copy| unicode:: U+000A9 .. COPYRIGHT SIGN diff --git a/doc/source/usersguide/installation.rst b/doc/source/usersguide/installation.rst new file mode 100644 index 0000000..3bc6596 --- /dev/null +++ b/doc/source/usersguide/installation.rst @@ -0,0 +1,524 @@ +Installation +============ + +.. epigraph:: + + Make things as simple as possible, but not any simpler. + + -- Albert Einstein + + +The Python Distutils are used to build and install PyTables, so it is fairly +simple to get the application up and running. If you want to install the +package from sources, you can continue reading in the next section. + +However, if you want to go straight to binaries that 'just work' for the main +platforms (Linux, Mac OSX and Windows), you might want to use the excellent +Anaconda_, ActivePython_ or Canopy_ distributions. PyTables usually distributes its own +Windows binaries too; see :ref:`binaryInstallationDescr` for instructions. +Finally, `Christoph Gohlke`_ also maintains an excellent suite of +binary packages for Windows at his site. + +.. _Anaconda: https://store.continuum.io/cshop/anaconda/ +.. _Canopy: https://www.enthought.com/products/canopy/ +.. _ActivePython: https://www.activestate.com/activepython/downloads +.. _`Christoph Gohlke`: http://www.lfd.uci.edu/~gohlke/pythonlibs/ + + +Installation from source +------------------------ + +These instructions are for both Unix/MacOS X and Windows systems. If you are +using Windows, it is assumed that you have a recent version of the MS Visual C++ +compiler installed. +A GCC compiler is assumed for Unix, but other compilers should work as well. + +Extensions in PyTables have been developed in Cython (see +:ref:`[CYTHON] `) and the C language.
You can rebuild everything from +scratch if you have Cython installed, but this is not necessary, as the Cython +compiled source is included in the source distribution. + +To compile PyTables you will need a recent version of Python, the HDF5 (C +flavor) library from http://www.hdfgroup.org, and the NumPy (see +:ref:`[NUMPY] `) and Numexpr (see :ref:`[NUMEXPR] `) +packages. + + +Prerequisites +~~~~~~~~~~~~~ + +First, make sure that you have + +* Python_ >= 3.6 (PyTables-3.5 was the last release with Python 2.7 support) +* HDF5_ >= 1.8.4 (>=1.8.15 is strongly recommended) +* NumPy_ >= 1.19.0 +* Numexpr_ >= 2.6.2 +* Cython_ >= 0.29.21 +* c-blosc_ >= 1.4.1 (sources are bundled with PyTables sources but the user can + use an external version of the sources using the :envvar:`BLOSC_DIR` environment + variable or the `--blosc` flag of the :file:`setup.py`) + +installed (for testing purposes, we are currently using HDF5_ 1.8.15, NumPy_ 1.10.2 +and Numexpr_ 2.5.2). If you don't, fetch and install them before +proceeding. + +.. _Python: http://www.python.org +.. _HDF5: http://www.hdfgroup.org/HDF5 +.. _NumPy: http://www.numpy.org +.. _Numexpr: http://code.google.com/p/numexpr +.. _Cython: http://www.cython.org +.. _c-blosc: http://blosc.org + +Compile and install these packages (but see :ref:`prerequisitesBinInst` for +instructions on how to install pre-compiled binaries if you are not willing +to compile the prerequisites on Windows systems). + +For compression (and possibly improved performance), you will need to install +the Zlib library (see :ref:`[ZLIB] `), which is also required by HDF5. +You may also optionally install the excellent LZO compression library (see +:ref:`[LZO] ` and :ref:`compressionIssues`). The high-performance bzip2 +compression library can also be used with PyTables (see +:ref:`[BZIP2] `). + +The Blosc (see :ref:`[BLOSC] `) compression library is embedded +in PyTables, so the embedded version will be used in case Blosc is not found in the +system. So, in case the installer warns about not finding it, do not +worry too much ;) + +**Unix** + + setup.py will detect HDF5, Blosc, LZO, or bzip2 libraries and include + files under :file:`/usr` or :file:`/usr/local`; this will cover most + manual installations as well as installations from packages. If setup.py + can not find libhdf5 (or liblzo, or libbz2, which you may wish to + use), or if you have several versions of a library installed and want to + use a particular one, then you can set the path to the resource in the + environment, by setting the values of the :envvar:`HDF5_DIR`, + :envvar:`LZO_DIR`, :envvar:`BZIP2_DIR` or :envvar:`BLOSC_DIR` environment + variables to the path to the particular resource. You may also specify the + locations of the resource root directories on the setup.py command line. + For example:: + + --hdf5=/stuff/hdf5-1.8.12 + --blosc=/stuff/blosc-1.8.1 + --lzo=/stuff/lzo-2.02 + --bzip2=/stuff/bzip2-1.0.5 + + If your HDF5 library was built as a shared library not in the runtime load + path, then you can specify the additional linker flags needed to find the + shared library on the command line as well. For example:: + + --lflags="-Xlinker -rpath -Xlinker /stuff/hdf5-1.8.12/lib" + + You may also want to try setting the :envvar:`LD_LIBRARY_PATH` + environment variable to point to the directory where the shared libraries + can be found. Check your compiler and linker documentation as well as the + Python Distutils documentation for the correct syntax or environment + variable names.
+ It is also possible to link with specific libraries by setting the + :envvar:`LIBS` environment variable:: + + LIBS="hdf5-1.8.12 nsl" + + Starting from PyTables 3.2, setup.py can also query the *pkg-config* database to + find the required packages. If available, pkg-config is used by default + unless explicitly disabled. + + To suppress the use of *pkg-config*:: + + $ python3 setup.py build --use-pkgconfig=FALSE + + or use the :envvar:`USE_PKGCONFIG` environment variable:: + + $ env USE_PKGCONFIG=FALSE python3 setup.py build + +**Windows** + + You can get ready-to-use Windows binaries and other development files for + most of the following libraries from the GnuWin32 project (see + :ref:`[GNUWIN32] `). In case you cannot find the LZO binaries + in the GnuWin32 repository, you can find them at + http://sourceforge.net/projects/pytables/files/lzo-win. + Once you have installed the prerequisites, setup.py needs to know where + the necessary library *stub* (.lib) and *header* (.h) files are installed. + You can set the path to the include and dll directories for the HDF5 + (mandatory) and LZO, BZIP2, BLOSC (optional) libraries in the environment, + by setting the values of the :envvar:`HDF5_DIR`, :envvar:`LZO_DIR`, + :envvar:`BZIP2_DIR` or :envvar:`BLOSC_DIR` environment variables to the + path to the particular resource. For example:: + + set HDF5_DIR=c:\\stuff\\hdf5-1.8.5-32bit-VS2008-IVF101\\release + set BLOSC_DIR=c:\\Program Files (x86)\\Blosc + set LZO_DIR=c:\\Program Files (x86)\\GnuWin32 + set BZIP2_DIR=c:\\Program Files (x86)\\GnuWin32 + + You may also specify the locations of the resource root directories on the + setup.py command line. + For example:: + + --hdf5=c:\\stuff\\hdf5-1.8.5-32bit-VS2008-IVF101\\release + --blosc=c:\\Program Files (x86)\\Blosc + --lzo=c:\\Program Files (x86)\\GnuWin32 + --bzip2=c:\\Program Files (x86)\\GnuWin32 + +**Conda** + + Pre-built packages for PyTables are available in the anaconda (default) + channel:: + + conda install pytables + + The most recent version is usually available in the conda-forge + channel:: + + conda config --add channels conda-forge + conda install pytables + + The HDF5 libraries and other helper packages are automatically found in + a conda environment. During installation setup.py uses the `CONDA_PREFIX` + environment variable to detect a conda environment. If detected, it will + try to find all packages within this environment. PyTables needs at least + the hdf5 package:: + + conda install hdf5 + python3 setup.py install + + It is still possible to override package locations using the + :envvar:`HDF5_DIR`, :envvar:`LZO_DIR`, :envvar:`BZIP2_DIR` or + :envvar:`BLOSC_DIR` environment variables. + + When inside a conda environment, *pkg-config* is not used. To disable + using the conda environment and fall back to *pkg-config* use `--no-conda`:: + + python3 setup.py install --no-conda + + When the `--use-pkgconfig` flag is used, `--no-conda` is assumed. + +**Development version (Unix)** + + Installation of the development version is very similar to installation + from a source package (described above). There are two main differences: + + #. sources have to be downloaded from the `PyTables source repository`_ + hosted on GitHub_. Git (see :ref:`[GIT] `) is used as VCS. + The following command creates a local copy of the latest development version + sources:: + + $ git clone --recursive https://github.com/PyTables/PyTables.git + + #.
sources in the git repository do not include pre-built documentation + and pre-generated C code of Cython extension modules. To be able to + generate them, both Cython (see :ref:`[CYTHON] `) and + sphinx >= 1.0.7 (see :ref:`[SPHINX] `) are mandatory + prerequisites. + +.. _`PyTables source repository`: https://github.com/PyTables/PyTables +.. _GitHub: https://github.com + + +PyTables package installation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once you have installed the HDF5 library and the NumPy and Numexpr packages, +you can proceed with the PyTables package itself. + +#. Run this command from the main PyTables distribution directory, including + any extra command line arguments as discussed above:: + + $ python3 setup.py build + + If the HDF5 installation is in a custom path, e.g. $HOME/hdf5-1.8.15pre7, + a command like the following can be used:: + + $ python3 setup.py build --hdf5=$HOME/hdf5-1.8.15pre7 + + .. note:: + + AVX2 support is detected automatically for your machine and, if found, + it is enabled by default. In some situations you may want to disable + AVX2 explicitly (maybe your binaries have to be exported and run on + machines that do not have AVX2 support). In that case, define the + DISABLE_AVX2 environment variable:: + + $ DISABLE_AVX2=True python3 setup.py build # for bash and its variants + +#. To run the test suite, execute any of these commands. + + **Unix** + In the sh shell and its variants:: + + $ cd build/lib.linux-x86_64-3.3 + $ env PYTHONPATH=. python3 tables/tests/test_all.py + + or, if you prefer:: + + $ cd build/lib.linux-x86_64-3.3 + $ env PYTHONPATH=. python3 -c "import tables; tables.test()" + + .. note:: + + the syntax used above overrides the original contents of the + :envvar:`PYTHONPATH` environment variable. + If this is not the desired behaviour and the user just wants to add + some path before existing ones, then the safest syntax to use is + the following:: + + $ env PYTHONPATH=.${PYTHONPATH:+:$PYTHONPATH} python3 tables/tests/test_all.py + + Please refer to your :program:`sh` documentation for details. + + **Windows** + + Open the command prompt (cmd.exe or command.com) and type:: + + > cd build\\lib.win-amd64-3.3 + > set PYTHONPATH=.;%PYTHONPATH% + > python3 tables\\tests\\test_all.py + + or:: + + > cd build\\lib.win-amd64-3.3 + > set PYTHONPATH=.;%PYTHONPATH% + > python3 -c "import tables; tables.test()" + + Both commands do the same thing, but the latter still works on an already + installed PyTables (so, there is no need to set the :envvar:`PYTHONPATH` + variable for this case). + However, before installation, the former is recommended because it is + more flexible, as you can see below. + If you would like to see verbose output from the tests simply add the + `-v` flag and/or the word verbose to the first of the command lines + above. You can also run only the tests in a particular test module. + For example, to execute just the test_types test suite, you only have to + specify it:: + + # change to backslashes for win + $ python3 tables/tests/test_types.py -v + + You have other options to pass to the :file:`test_all.py` driver:: + + # change to backslashes for win + $ python3 tables/tests/test_all.py --heavy + + The command above runs every test in the test suite. Beware, it can take a + lot of time, CPU and memory resources to complete:: + + # change to backslashes for win + $ python3 tables/tests/test_all.py --print-versions + + The command above shows the versions for all the packages that PyTables + relies on.
Please be sure to include this when reporting bugs:: + + # only under Linux 2.6.x + $ python3 tables/tests/test_all.py --show-memory + + The command above prints out the evolution of the memory consumption after + each test module completion. It's useful for locating memory leaks in + PyTables (or packages behind it). Only valid for Linux 2.6.x kernels. + And last, but not least, in case a test fails, please run the failing test + module again and enable the verbose output:: + + $ python3 tables/tests/test_.py -v verbose + + and, very important, obtain your PyTables version information by using the + `--print-versions` flag (see above) and send back both outputs to + developers so that we may continue improving PyTables. + If you run into problems because Python can not load the HDF5 library or + other shared libraries, try the following. + + **Unix** + + Try setting the LD_LIBRARY_PATH or equivalent environment variable to + point to the directory where the missing libraries can be found. + + **Windows** + + Put the DLL libraries (hdf5dll.dll and, optionally, lzo1.dll, + bzip2.dll or blosc.dll) in a directory listed in your + :envvar:`PATH` environment variable. The setup.py installation + program will print out a warning to that effect if the libraries + can not be found. + +#. To install the entire PyTables Python package, change back to the root + distribution directory and run the following command (make sure you have + sufficient permissions to write to the directories where the PyTables files + will be installed):: + + $ python3 setup.py install + + Again, if one needs to point to libraries installed in custom paths, then + specific setup.py options can be used:: + + $ python3 setup.py install --hdf5=/hdf5/custom/path + + or:: + + $ env HDF5_DIR=/hdf5/custom/path python3 setup.py install + + Of course, you will need super-user privileges if you want to install + PyTables on a system-protected area. You can select, though, a different + place to install the package using the `--prefix` flag:: + + $ python3 setup.py install --prefix="/home/myuser/mystuff" + + Bear in mind, however, that if you use the `--prefix` flag to + install in a non-standard place, you should properly setup your + :envvar:`PYTHONPATH` environment variable, so that the Python interpreter + will be able to find your new PyTables installation. + You have more installation options available in the Distutils package. + Issue a:: + + $ python3 setup.py install --help + + for more information on that subject. + +That's it! Now you can skip to the next chapter to learn how to use PyTables. + + +Installation with :program:`pip` +-------------------------------- + +Many users find it useful to use the :program:`pip` program (or similar ones) +to install Python packages. + +As explained in the previous sections, the user should in any case ensure that all +dependencies listed in the `Prerequisites`_ section are correctly installed.
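+As a quick sanity check (a minimal sketch, not an official part of the
+installation procedure), you can verify from a Python shell that the
+Python-level prerequisites are importable and report their versions::
+
+    >>> import numpy, numexpr
+    >>> numpy.__version__, numexpr.__version__
+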
+ +The simplest way to install PyTables using :program:`pip` is the following:: + + $ python3 -m pip install tables + +The following example shows how to install the latest stable version of +PyTables in the user folder when an older version of the package is already +installed at system level:: + + $ python3 -m pip install --user --upgrade tables + +The `--user` option tells the :program:`pip` tool to install the package in +the user folder (``$HOME/.local`` on GNU/Linux and Unix systems), while the +`--upgrade` option forces the installation of the latest version even if an +older version of the package is already installed. + +Additional options for the setup.py script can be specified using the +`--install-option` flag:: + + $ python3 -m pip install --install-option='--hdf5=/custom/path/to/hdf5' tables + +or:: + + $ env HDF5_DIR=/custom/path/to/hdf5 python3 -m pip install tables + +The :program:`pip` tool can also be used to install packages from a source +tar-ball:: + + $ python3 -m pip install tables-3.0.0.tar.gz + +To install the development version of PyTables from the *develop* branch of +the main :program:`git` :ref:`[GIT] ` repository the command is the +following:: + + $ python3 -m pip install git+https://github.com/PyTables/PyTables.git@develop#egg=tables + +A similar command can be used to install a specific tagged version:: + + $ python3 -m pip install git+https://github.com/PyTables/PyTables.git@v.2.4.0#egg=tables + +Of course, :program:`pip` can only be used to install Python packages. +Other dependencies, like the HDF5 library or the compression libraries, have to +be installed by the user. + +.. note:: + + On recent versions of Debian_ and Ubuntu_ the HDF5 library is installed + with a very peculiar layout that allows both the serial and MPI + versions to be installed at the same time. + + PyTables >= 3.2 natively supports the new layout via *pkg-config* (which + is expected to be installed on the system at build time). + + If *pkg-config* is not available or PyTables is older than version 3.2, + then the following command can be used:: + + $ env CPPFLAGS=-I/usr/include/hdf5/serial \ + LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial python3 setup.py install + + or:: + + $ env CPPFLAGS=-I/usr/include/hdf5/serial \ + LDFLAGS=-L/usr/lib/x86_64-linux-gnu/hdf5/serial python3 -m pip install tables + +.. _Debian: https://www.debian.org +.. _Ubuntu: http://www.ubuntu.com + + +.. _binaryInstallationDescr: + +Binary installation (Windows) +----------------------------- + +This section is intended for installing precompiled binaries on Windows +platforms. Binaries are distributed in wheel format, which can be downloaded +and installed using pip as described above. You may also find this section useful for +instructions on how to install *binary prerequisites* even if you want to +compile PyTables itself on Windows. + +.. _prerequisitesBinInst: + +Windows prerequisites +~~~~~~~~~~~~~~~~~~~~~ + +First, make sure that you have Python 3, NumPy 1.8.0 and Numexpr 2.5.2 or +higher installed. + +To enable compression with the optional LZO library (see +:ref:`compressionIssues` for hints about how it may be used to improve +performance), fetch and install the LZO binaries from +http://sourceforge.net/projects/pytables/files/lzo-win (choose v1.x for +Windows 32-bit and v2.x for Windows 64-bit).
+Normally, you will only need to fetch that package and copy the included +lzo1.dll/lzo2.dll file to a directory listed in the PATH environment variable +(for example C:\\WINDOWS\\SYSTEM) or +python_installation_path\\Lib\\site-packages\\tables (the last directory may +not exist yet, so if you want to install the DLL there, you should do so +*after* installing the PyTables package), so that it can be found by the +PyTables extensions. + +Please note that PyTables has internal machinery for dealing with uninstalled +optional compression libraries, so you don't need to install the LZO or bzip2 +dynamic libraries if you don't want to. + + +PyTables package installation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Wheels for 32 and 64-bit versions of Windows are usually provided on PyPI. They +are automatically found and installed using pip:: + + $ python3 -m pip install tables + +If a matching wheel cannot be found for your installation, third party built wheels +can be found e.g. at the `Unofficial Windows Binaries for Python Extension Packages +`_ page. Download the wheel +matching your Python version and the 32 or 64-bit variant, and install it +using pip:: + + # python 3.6 64-bit: + $ python3 -m pip install tables-3.6.1-2-cp36-cp36m-win_amd64.whl + +You can (and *you should*) test your installation by running the following +commands:: + + >>> import tables + >>> tables.test() + +in your favorite Python shell. If all the tests pass (possibly with a few +warnings, related to the potential unavailability of the LZO library) you already have +a working, well-tested copy of PyTables installed! If any test fails, please +copy the output of the error messages as well as the output of:: + + >>> tables.print_versions() + +and mail them to the developers so that the problem can be fixed in future +releases. + +You can proceed now to the next chapter to see how to use PyTables. diff --git a/doc/source/usersguide/introduction.rst b/doc/source/usersguide/introduction.rst new file mode 100644 index 0000000..9e4d4a3 --- /dev/null +++ b/doc/source/usersguide/introduction.rst @@ -0,0 +1,319 @@ +Introduction +============ +.. epigraph:: + + La sabiduría no vale la pena si no es posible servirse de ella para + inventar una nueva manera de preparar los garbanzos. + + [Wisdom isn't worth anything if you can't use it to come up with a new + way to cook garbanzos.] + + -- Gabriel García Márquez, A wise Catalan in *"Cien años de soledad"* + + +The goal of PyTables is to enable the end user to easily manipulate data +*tables* and *array* objects in a hierarchical structure. The foundation of +the underlying hierarchical data organization is the excellent HDF5 library +(see :ref:`[HDFG1] `). + +It should be noted that this package is not intended to serve as a complete +wrapper for the entire HDF5 API, but only to provide a flexible, *very +pythonic* tool to deal with (arbitrarily) large amounts of data (typically +bigger than available memory) in tables and arrays organized in a +hierarchical and persistent disk storage structure. + +A table is defined as a collection of records whose values are stored in +*fixed-length* fields. All records have the same structure and all values in +each field have the same *data type*.
The terms *fixed-length* and strict +*data types* may seem to be a strange requirement for an interpreted language +like Python, but they serve a useful function if the goal is to save very +large quantities of data (such as is generated by many data acquisition +systems, Internet services or scientific applications, for example) in an +efficient manner that reduces demand on CPU time and I/O. + +In order to emulate records mapped to HDF5 C structs in Python, PyTables +implements a special class that makes it easy to define all their fields and other +properties. PyTables also provides a powerful interface to mine data in +tables. Records in tables are also known in the HDF5 naming scheme as +*compound* data types. + +For example, you can define arbitrary tables in Python simply by declaring a +class with named fields and type information, such as in the following +example:: + + class Particle(IsDescription): + name = StringCol(16) # 16-character String + idnumber = Int64Col() # signed 64-bit integer + ADCcount = UInt16Col() # unsigned short integer + TDCcount = UInt8Col() # unsigned byte + grid_i = Int32Col() # integer + grid_j = Int32Col() # integer + + # A sub-structure (nested data-type) + class Properties(IsDescription): + pressure = Float32Col(shape=(2,3)) # 2-D float array (single-precision) + energy = Float64Col(shape=(2,3,4)) # 3-D float array (double-precision) + +You then pass this class to the table constructor, fill its rows with your +values, and save (arbitrarily large) collections of them to a file for +persistent storage. After that, the data can be retrieved and post-processed +quite easily with PyTables or even with another HDF5 application (in C, +Fortran, Java or any other language that provides a library to interface with +HDF5). + +Other important entities in PyTables are *array* objects, which are analogous +to tables with the difference that all of their components are homogeneous. +They come in different flavors, like *generic* (they provide a quick and fast +way to deal with numerical arrays), *enlargeable* (arrays can be extended +along a single dimension) and *variable length* (each row in the array can +have a different number of elements). + +The next section describes the most interesting capabilities of PyTables. + + +Main Features +------------- +PyTables takes advantage of the object orientation and introspection +capabilities offered by Python, the powerful data management features of +HDF5, and NumPy's flexibility and Numexpr's high-performance manipulation of +large sets of objects organized in a grid-like fashion to provide these +features: + +- *Support for table entities:* You can tailor your data by adding or deleting + records in your tables. Large numbers of rows (up to 2**63, much more than + will fit into memory) are supported as well. + +- *Multidimensional and nested table cells:* You can declare a column to + consist of values having any number of dimensions besides scalars, which is + the only dimensionality allowed by the majority of relational databases. + You can even declare columns that are made of other columns (of different + types). + +- *Indexing support for columns of tables:* + Very useful if you have large tables and you want to quickly look up + values in columns satisfying some criteria. + +- *Support for numerical arrays:* + NumPy (see :ref:`[NUMPY] `) arrays can be used as a useful + complement to tables to store homogeneous data.
+ +- *Enlargeable arrays:* You can add new + elements to existing arrays on disk along any one dimension you choose + (but only one). Besides, you are able to access just a slice of your datasets by + using the powerful extended slicing mechanism, without needing to load the + complete dataset into memory. + +- *Variable length arrays:* The number of elements in these arrays can vary + from row to row. This provides a lot of flexibility when dealing with + complex data. + +- *Supports a hierarchical data model:* + Allows the user to clearly structure all data. PyTables builds up an + *object tree* in memory that replicates the underlying file data structure. + Access to objects in the file is achieved by walking through and + manipulating this object tree. + Besides, this object tree is built in a lazy way, for efficiency purposes. + +- *User defined metadata:* Besides + supporting system metadata (like the number of rows of a table, shape, + flavor, etc.) the user may specify arbitrary metadata (as for example, room + temperature, or protocol for IP traffic that was collected) that complements + the meaning of the actual data. + +- *Ability to read/modify generic HDF5 files:* PyTables can access a wide + range of objects in generic HDF5 files, like compound type datasets (that + can be mapped to Table objects), homogeneous datasets (that can be mapped + to Array objects) or variable length record datasets (that can be mapped to + VLArray objects). Besides, if a dataset is not supported, it will be mapped + to a special UnImplemented class (see :ref:`UnImplementedClassDescr`), that + will let the user see that the data is there, although it will be + unreachable (still, you will be able to access the attributes and some + metadata in the dataset). With that, PyTables probably can access and + *modify* most of the HDF5 files out there. + +- *Data compression:* Supports data compression (using the *Zlib*, *LZO*, + *bzip2* and *Blosc* compression libraries) out of the box. This is + important when you have repetitive data patterns and don't want to spend + time searching for an optimized way to store them (saving you time spent + analyzing your data organization). + +- *High performance I/O:* On modern systems storing large amounts of data, + tables and array objects can be read and written at a speed only limited by + the performance of the underlying I/O subsystem. Moreover, if your data is + compressible, even that limit is surmountable! + +- *Support for files bigger than 2 GB:* + PyTables automatically inherits this capability from the underlying HDF5 + library (assuming your platform supports the C long long integer, or, on + Windows, __int64). + +- *Architecture-independent:* PyTables has been carefully coded (as has HDF5 + itself) with little-endian/big-endian byte ordering issues in mind. So, you + can write a file on a big-endian machine (like a Sparc or MIPS) and read it + on a little-endian machine (like an Intel or Alpha) without problems. + In addition, it has been tested successfully with 64 bit platforms + (Intel-64, AMD-64, PowerPC-G5, MIPS, UltraSparc) using code generated with + 64 bit aware compilers. + + +.. _ObjectTreeSection: + +The Object Tree +--------------- +The hierarchical model of the underlying HDF5 library allows PyTables to +manage tables and arrays in a tree-like structure. In order to achieve this, +an *object tree* entity is *dynamically* created imitating the HDF5 structure +on disk. The HDF5 objects are read by walking through this object tree. You
You +can get a good picture of what kind of data is kept in the object by +examining the *metadata* nodes. + +The different nodes in the object tree are instances of PyTables classes. +There are several types of classes, but the most important ones are the Node, +Group and Leaf classes. All nodes in a PyTables tree are instances of the +Node class. The Group and Leaf classes are descendants of Node. Group +instances (referred to as *groups* from now on) are a grouping structure +containing instances of zero or more groups or leaves, together with +supplementary metadata. Leaf instances (referred to as *leaves*) are +containers for actual data and can not contain further groups or leaves. The +Table, Array, CArray, EArray, VLArray and UnImplemented classes are +descendants of Leaf, and inherit all its properties. + +Working with groups and leaves is similar in many ways to working with +directories and files on a Unix filesystem, i.e. a node (file or directory) +is always a *child* of one and only one group (directory), its *parent group* +[1]_. +Inside of that group, the node is accessed by its *name*. As is the case with +Unix directories and files, objects in the object tree are often referenced +by giving their full (absolute) path names. In PyTables this full path can be +specified either as string (such as '/subgroup2/table3', using / as a +parent/child separator) or as a complete object path written in a format +known as the *natural name* schema (such as file.root.subgroup2.table3). + +Support for *natural naming* is a key aspect of PyTables. It means that the +names of instance variables of the node objects are the same as the names of +its children [2]_. This is very *Pythonic* and intuitive in many cases. Check +the tutorial :ref:`readingAndSelectingUsage` for usage examples. + +You should also be aware that not all the data present in a file is loaded +into the object tree. The *metadata* (i.e. special data that describes the +structure of the actual data) is loaded only when the user want to access to +it (see later). Moreover, the actual data is not read until she request it +(by calling a method on a particular node). Using the object tree (the +metadata) you can retrieve information about the objects on disk such as +table names, titles, column names, data types in columns, numbers of rows, +or, in the case of arrays, their shapes, typecodes, etc. You can also search +through the tree for specific kinds of data then read it and process it. In a +certain sense, you can think of PyTables as a tool that applies the same +introspection capabilities of Python objects to large amounts of data in +persistent storage. + +It is worth noting that PyTables sports a *metadata cache system* that loads +nodes *lazily* (i.e. on-demand), and unloads nodes that have not been used +for some time (following a *Least Recently Used* schema). It is important to +stress out that the nodes enter the cache after they have been unreferenced +(in the sense of Python reference counting), and that they can be revived (by +referencing them again) directly from the cache without performing the +de-serialization process from disk. This feature allows dealing with files +with large hierarchies very quickly and with low memory consumption, while +retaining all the powerful browsing capabilities of the previous +implementation of the object tree. See :ref:`[OPTIM] ` for more facts +about the advantages introduced by this new metadata cache system. 
+ +To better understand the dynamic nature of this object tree entity, let's +start with a sample PyTables script (which you can find in +examples/objecttree.py) to create an HDF5 file:: + + import tables as tb + + class Particle(tb.IsDescription): + identity = tb.StringCol(itemsize=22, dflt=" ", pos=0) # character String + idnumber = tb.Int16Col(dflt=1, pos = 1) # short integer + speed = tb.Float32Col(dflt=1, pos = 2) # single-precision + + # Open a file in "w"rite mode + fileh = tb.open_file("objecttree.h5", mode = "w") + + # Get the HDF5 root group + root = fileh.root + + # Create the groups + group1 = fileh.create_group(root, "group1") + group2 = fileh.create_group(root, "group2") + + # Now, create an array in root group + array1 = fileh.create_array(root, "array1", ["string", "array"], "String array") + + # Create two new tables, one in group1 and another in group2 + table1 = fileh.create_table(group1, "table1", Particle) + table2 = fileh.create_table("/group2", "table2", Particle) + + # Create the second array, in group1 + array2 = fileh.create_array("/group1", "array2", [1,2,3,4]) + + # Now, fill the tables + for table in (table1, table2): + # Get the record object associated with the table: + row = table.row + + # Fill the table with 10 records + for i in range(10): + # First, assign the values to the Particle record + row['identity'] = f'This is particle: {i:2d}' + row['idnumber'] = i + row['speed'] = i * 2. + + # This injects the Record values + row.append() + + # Flush the table buffers + table.flush() + + # Finally, close the file (this also will flush all the remaining buffers!) + fileh.close() + +This small program creates a simple HDF5 file called objecttree.h5 with the +structure that appears in :ref:`Figure 1 ` [3]_. +When the file is created, the metadata in the object tree is updated in +memory while the actual data is saved to disk. When you close the file the +object tree is no longer available. However, when you reopen this file the +object tree will be reconstructed in memory from the metadata on disk (this +is done in a lazy way, in order to load only the objects that are required by +the user), allowing you to work with it in exactly the same way as when you +originally created it. + +.. _objecttree-h5: + +.. figure:: images/objecttree-h5.png + :align: center + + **Figure 1: An HDF5 example with 2 subgroups, 2 tables and 1 array.** + +In :ref:`Figure2 `, you can see an example of the object tree +created when the above objecttree.h5 file is read (in fact, such an object +tree is always created when reading any supported generic HDF5 file). +It is worthwhile to take your time to understand it [4]_. +It will help you understand the relationships of in-memory PyTables objects. + +.. _objecttree: + +.. figure:: images/objecttree.png + :width: 100% + :align: center + + **Figure 2: A PyTables object tree example.** + +--------------------------- + +.. [1] PyTables does not support hard links - for the moment. + +.. [2] I got this simple but powerful idea from the excellent Objectify + module by David Mertz (see :ref:`[MERTZ] `). + +.. [3] We have used ViTables (see :ref:`[VITABLES] `) in order to + create this snapshot. + +.. [4] Bear in mind, however, that this diagram is *not* a standard UML class + diagram; it is rather meant to show the connections between the + PyTables objects and some of its most important attributes and + methods.
+ diff --git a/doc/source/usersguide/libref.rst b/doc/source/usersguide/libref.rst new file mode 100644 index 0000000..b7fd7aa --- /dev/null +++ b/doc/source/usersguide/libref.rst @@ -0,0 +1,36 @@ +.. _library_reference: + +Library Reference +================= +PyTables implements several classes to represent the different nodes in the +object tree. They are named File, Group, Leaf, Table, Array, CArray, EArray, +VLArray and UnImplemented. Another one allows the user to complement the +information on these different objects; its name is AttributeSet. Finally, +another important class called IsDescription allows you to build a Table record +description by declaring a subclass of it. Many other classes are defined in +PyTables, but they can be regarded as helpers whose goal is mainly to declare +the *data type properties* of the different first-class objects and will be +described at the end of this chapter as well. + +An important function, called open_file, is responsible for creating, opening or appending +to files. In addition, a few utility functions are defined to guess if the user +supplied file is a *PyTables* or *HDF5* file. These are called is_pytables_file() +and is_hdf5_file(), respectively. There is also a function called +which_lib_version() that reports the versions of the underlying C libraries +(for example, HDF5 or Zlib) and another called print_versions() that prints all +the versions of the software that PyTables relies on. Finally, test() lets you +run the complete test suite from a Python console interactively. + +.. toctree:: + :maxdepth: 2 + + libref/top_level + libref/file_class + libref/hierarchy_classes + libref/structured_storage + libref/homogenous_storage + libref/link_classes + libref/declarative_classes + libref/helper_classes + libref/expr_class + libref/filenode_classes diff --git a/doc/source/usersguide/libref/declarative_classes.rst b/doc/source/usersguide/libref/declarative_classes.rst new file mode 100644 index 0000000..55fed84 --- /dev/null +++ b/doc/source/usersguide/libref/declarative_classes.rst @@ -0,0 +1,280 @@ +.. currentmodule:: tables + +Declarative classes +=================== +This section describes a series of classes that are meant to +*declare* the datatypes that are required for creating +primary PyTables datasets. + + +.. _AtomClassDescr: + +The Atom class and its descendants +---------------------------------- +.. autoclass:: Atom + +.. These are defined in the class docstring + Atom instance variables + ^^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: Atom.dflt + .. autoattribute:: Atom.dtype + .. autoattribute:: Atom.itemsize + .. autoattribute:: Atom.kind + .. autoattribute:: Atom.shape + .. autoattribute:: Atom.type + + +Atom properties +~~~~~~~~~~~~~~~ +.. autoattribute:: Atom.ndim + +.. autoattribute:: Atom.recarrtype + +.. autoattribute:: Atom.size + + +Atom methods +~~~~~~~~~~~~ +.. automethod:: Atom.copy + + +Atom factory methods +~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Atom.from_dtype + +.. automethod:: Atom.from_kind + +.. automethod:: Atom.from_sctype + +.. automethod:: Atom.from_type + + +Atom Sub-classes +~~~~~~~~~~~~~~~~ +.. autoclass:: StringAtom + :members: + +.. autoclass:: BoolAtom + :members: + +.. autoclass:: IntAtom + :members: + +.. autoclass:: Int8Atom + :members: + +.. autoclass:: Int16Atom + :members: + +.. autoclass:: Int32Atom + :members: + +.. autoclass:: Int64Atom + :members: + +.. autoclass:: UIntAtom + :members: + +.. autoclass:: UInt8Atom + :members: + +.. autoclass:: UInt16Atom + :members: + +..
autoclass:: UInt32Atom + :members: + +.. autoclass:: UInt64Atom + :members: + +.. autoclass:: FloatAtom + :members: + +.. autoclass:: Float32Atom + :members: + +.. autoclass:: Float64Atom + :members: + +.. autoclass:: ComplexAtom + :members: + +.. autoclass:: Time32Atom + :members: + +.. autoclass:: Time64Atom + :members: + +.. autoclass:: EnumAtom + :members: + + +Pseudo atoms +~~~~~~~~~~~~ +Now, there come three special classes, ObjectAtom, VLStringAtom and +VLUnicodeAtom, that actually do not descend from Atom, but whose goal is so +similar that they should be described here. Pseudo-atoms can only be used with +VLArray datasets (see :ref:`VLArrayClassDescr`), and they do not support +multidimensional values, nor multiple values per row. + +They can be recognised because they also have kind, type and shape attributes, +but no size, itemsize or dflt ones. Instead, they have a base atom which +defines the elements used for storage. + +See :file:`examples/vlarray1.py` and :file:`examples/vlarray2.py` for further +examples on VLArray datasets, including object serialization and string +management. + + +ObjectAtom +^^^^^^^^^^ +.. autoclass:: ObjectAtom + :members: + + +.. _VLStringAtom: + +VLStringAtom +^^^^^^^^^^^^ +.. autoclass:: VLStringAtom + :members: + + +.. _VLUnicodeAtom: + +VLUnicodeAtom +^^^^^^^^^^^^^ +.. autoclass:: VLUnicodeAtom + :members: + + +.. _ColClassDescr: + +The Col class and its descendants +--------------------------------- +.. autoclass:: Col + +.. + Col instance variables + ^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: _v_pos + + +Col instance variables +~~~~~~~~~~~~~~~~~~~~~~ +In addition to the variables that they inherit from the Atom class, Col +instances have the following attributes. + +.. attribute:: Col._v_pos + + The *relative* position of this column with regard to its column + siblings. + + +Col factory methods +~~~~~~~~~~~~~~~~~~~ +.. automethod:: Col.from_atom + + +Col sub-classes +~~~~~~~~~~~~~~~ +.. autoclass:: StringCol + :members: + +.. autoclass:: BoolCol + :members: + +.. autoclass:: IntCol + :members: + +.. autoclass:: Int8Col + :members: + +.. autoclass:: Int16Col + :members: + +.. autoclass:: Int32Col + :members: + +.. autoclass:: Int64Col + :members: + +.. autoclass:: UIntCol + :members: + +.. autoclass:: UInt8Col + :members: + +.. autoclass:: UInt16Col + :members: + +.. autoclass:: UInt32Col + :members: + +.. autoclass:: UInt64Col + :members: + +.. autoclass:: Float32Col + :members: + +.. autoclass:: Float64Col + :members: + +.. autoclass:: ComplexCol + :members: + +.. autoclass:: TimeCol + :members: + +.. autoclass:: Time32Col + :members: + +.. autoclass:: Time64Col + :members: + +.. autoclass:: EnumCol + :members: + + +.. _IsDescriptionClassDescr: + +The IsDescription class +----------------------- +.. autoclass:: IsDescription + + +Description helper functions +---------------------------- +.. autofunction:: tables.description.descr_from_dtype + +.. autofunction:: tables.description.dtype_from_descr + + +.. _AttributeSetClassDescr: + +The AttributeSet class +---------------------- +.. autoclass:: tables.attributeset.AttributeSet + +.. These are defined in the class docstring + AttributeSet attributes + ~~~~~~~~~~~~~~~~~~~~~~~ + .. autoattribute:: tables.attributeset.AttributeSet._v_attrnames + .. autoattribute:: tables.attributeset.AttributeSet._v_attrnamessys + .. autoattribute:: tables.attributeset.AttributeSet._v_attrnamesuser + ..
autoattribute:: tables.attributeset.AttributeSet._v_unimplemented + +AttributeSet properties +~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: tables.attributeset.AttributeSet._v_node + + +AttributeSet methods +~~~~~~~~~~~~~~~~~~~~ +.. automethod:: tables.attributeset.AttributeSet._f_copy + +.. automethod:: tables.attributeset.AttributeSet._f_list + +.. automethod:: tables.attributeset.AttributeSet._f_rename + +.. automethod:: tables.attributeset.AttributeSet.__contains__ diff --git a/doc/source/usersguide/libref/expr_class.rst b/doc/source/usersguide/libref/expr_class.rst new file mode 100644 index 0000000..3a0f215 --- /dev/null +++ b/doc/source/usersguide/libref/expr_class.rst @@ -0,0 +1,37 @@ +.. currentmodule:: tables + +General purpose expression evaluator class +========================================== + +The Expr class +-------------- +.. autoclass:: Expr + +.. These are defined in the class docstring. + Expr instance variables + ~~~~~~~~~~~~~~~~~~~~~~~ + .. autoattribute:: Expr.append_mode + .. autoattribute:: Expr.maindim + .. autoattribute:: Expr.names + .. autoattribute:: Expr.out + .. autoattribute:: Expr.o_start + .. autoattribute:: Expr.o_stop + .. autoattribute:: Expr.o_step + .. autoattribute:: Expr.shape + .. autoattribute:: Expr.values + + +Expr methods +~~~~~~~~~~~~ +.. automethod:: Expr.eval + +.. automethod:: Expr.set_inputs_range + +.. automethod:: Expr.set_output + +.. automethod:: Expr.set_output_range + + +Expr special methods +~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Expr.__iter__ diff --git a/doc/source/usersguide/libref/file_class.rst b/doc/source/usersguide/libref/file_class.rst new file mode 100644 index 0000000..0ec9ca3 --- /dev/null +++ b/doc/source/usersguide/libref/file_class.rst @@ -0,0 +1,137 @@ +.. currentmodule:: tables + +File manipulation class +======================= + +.. _FileClassDescr: + +The File Class +-------------- +.. autoclass:: File + +.. These are defined in the class docstring. + This is necessary because attributes created in a class's + __init__ method can't be documented with autoattribute. + See Sphinx bug #904. + https://bitbucket.org/birkenfeld/sphinx/issue/904 + + Attributes + ~~~~~~~~~~ + .. autoattribute:: File.filename + .. autoattribute:: File.format_version + .. autoattribute:: File.isopen + .. autoattribute:: File.mode + .. autoattribute:: File.root + .. autoattribute:: File.root_uep + + +File properties +~~~~~~~~~~~~~~~ +.. autoattribute:: File.title + +.. autoattribute:: File.filters + + +File methods - file handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: File.close + +.. automethod:: File.copy_file + +.. automethod:: File.flush + +.. automethod:: File.fileno + +.. automethod:: File.__enter__ + +.. automethod:: File.__exit__ + +.. automethod:: File.__str__ + +.. automethod:: File.__repr__ + +.. automethod:: File.get_file_image + +.. automethod:: File.get_filesize + +.. automethod:: File.get_userblock_size + + +File methods - hierarchy manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: File.copy_children + +.. automethod:: File.copy_node + +.. automethod:: File.create_array + +.. automethod:: File.create_carray + +.. automethod:: File.create_earray + +.. automethod:: File.create_external_link + +.. automethod:: File.create_group + +.. automethod:: File.create_hard_link + +.. automethod:: File.create_soft_link + +.. automethod:: File.create_table + +.. automethod:: File.create_vlarray + +.. automethod:: File.move_node + +.. automethod:: File.remove_node + +.. 
automethod:: File.rename_node + + +File methods - tree traversal +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: File.get_node + +.. automethod:: File.is_visible_node + +.. automethod:: File.iter_nodes + +.. automethod:: File.list_nodes + +.. automethod:: File.walk_groups + +.. automethod:: File.walk_nodes + +.. automethod:: File.__contains__ + +.. automethod:: File.__iter__ + + +File methods - Undo/Redo support +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: File.disable_undo + +.. automethod:: File.enable_undo + +.. automethod:: File.get_current_mark + +.. automethod:: File.goto + +.. automethod:: File.is_undo_enabled + +.. automethod:: File.mark + +.. automethod:: File.redo + +.. automethod:: File.undo + + +File methods - attribute handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: File.copy_node_attrs + +.. automethod:: File.del_node_attr + +.. automethod:: File.get_node_attr + +.. automethod:: File.set_node_attr diff --git a/doc/source/usersguide/libref/filenode_classes.rst b/doc/source/usersguide/libref/filenode_classes.rst new file mode 100644 index 0000000..ee0cd7d --- /dev/null +++ b/doc/source/usersguide/libref/filenode_classes.rst @@ -0,0 +1,151 @@ +.. currentmodule:: tables.nodes.filenode + +.. _filenode_classes: + +Filenode Module +=============== + +.. automodule:: tables.nodes.filenode + + +Module constants +---------------- + +.. autodata:: NodeType + +.. autodata:: NodeTypeVersions + + +Module functions +---------------- + +.. autofunction:: new_node + +.. autofunction:: open_node + +.. autofunction:: read_from_filenode + +.. autofunction:: save_to_filenode + + +The RawPyTablesIO base class +---------------------------- + +.. autoclass:: RawPyTablesIO + + +RawPyTablesIO attributes +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoattribute:: RawPyTablesIO.mode + + +RawPyTablesIO methods +~~~~~~~~~~~~~~~~~~~~~ + +.. automethod:: RawPyTablesIO.tell + +.. automethod:: RawPyTablesIO.seek + +.. automethod:: RawPyTablesIO.seekable + +.. automethod:: RawPyTablesIO.fileno + +.. automethod:: RawPyTablesIO.close + +.. automethod:: RawPyTablesIO.flush + +.. automethod:: RawPyTablesIO.truncate + +.. automethod:: RawPyTablesIO.readable + +.. automethod:: RawPyTablesIO.writable + +.. automethod:: RawPyTablesIO.readinto + +.. automethod:: RawPyTablesIO.readline + +.. automethod:: RawPyTablesIO.write + + +The ROFileNode class +-------------------- + +.. autoclass:: ROFileNode + + +ROFileNode attributes +~~~~~~~~~~~~~~~~~~~~~ + +.. autoattribute:: ROFileNode.attrs + + +ROFileNode methods +~~~~~~~~~~~~~~~~~~ + +.. automethod:: ROFileNode.flush + +.. automethod:: ROFileNode.read + +.. automethod:: ROFileNode.readline + +.. automethod:: ROFileNode.readlines + +.. automethod:: ROFileNode.close + +.. automethod:: ROFileNode.seek + +.. automethod:: ROFileNode.tell + +.. automethod:: ROFileNode.readable + +.. automethod:: ROFileNode.writable + +.. automethod:: ROFileNode.seekable + +.. automethod:: ROFileNode.fileno + + +The RAFileNode class +-------------------- + +.. autoclass:: RAFileNode + + +RAFileNode attributes +~~~~~~~~~~~~~~~~~~~~~ + +.. autoattribute:: RAFileNode.attrs + + +RAFileNode methods +~~~~~~~~~~~~~~~~~~ + +.. automethod:: RAFileNode.flush + +.. automethod:: RAFileNode.read + +.. automethod:: RAFileNode.readline + +.. automethod:: RAFileNode.readlines + +.. automethod:: RAFileNode.truncate + +.. automethod:: RAFileNode.write + +.. automethod:: RAFileNode.writelines + +.. automethod:: RAFileNode.close + +.. automethod:: RAFileNode.seek + +.. automethod:: RAFileNode.tell + +.. 
automethod:: RAFileNode.readable + +.. automethod:: RAFileNode.writable + +.. automethod:: RAFileNode.seekable + +.. automethod:: RAFileNode.fileno + diff --git a/doc/source/usersguide/libref/helper_classes.rst b/doc/source/usersguide/libref/helper_classes.rst new file mode 100644 index 0000000..93e426d --- /dev/null +++ b/doc/source/usersguide/libref/helper_classes.rst @@ -0,0 +1,152 @@ +.. currentmodule:: tables + + +Helper classes +============== +This section describes some classes that do not fit in any other +section and that mainly serve for ancillary purposes. + + +.. _FiltersClassDescr: + +The Filters class +----------------- +.. autoclass:: Filters + +.. These are defined in the class docstring. + Filters instance variables + ^^^^^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: Filters.bitshuffle + .. autoattribute:: Filters.fletcher32 + .. autoattribute:: Filters.complevel + .. autoattribute:: Filters.complib + .. autoattribute:: Filters.shuffle + + +Filters methods +~~~~~~~~~~~~~~~ +.. automethod:: Filters.copy + + +.. _IndexClassDescr: + +The Index class +--------------- +.. autoclass:: tables.index.Index + +.. This is defined in the class docstring + .. autoattribute:: tables.index.Index.nelements + +Index instance variables +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: tables.index.Index.column + +.. autoattribute:: tables.index.Index.dirty + +.. autoattribute:: tables.index.Index.filters + +.. autoattribute:: tables.index.Index.is_csi + +.. attribute:: tables.index.Index.nelements + + The number of currently indexed rows for this column. + + +Index methods +~~~~~~~~~~~~~ +.. automethod:: tables.index.Index.read_sorted + +.. automethod:: tables.index.Index.read_indices + + +Index special methods +~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: tables.index.Index.__getitem__ + + +The IndexArray class +-------------------- + +.. autoclass:: tables.indexes.IndexArray + :members: + + +.. _EnumClassDescr: + +The Enum class +-------------- +.. autoclass:: tables.misc.enum.Enum + + +Enum special methods +~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Enum.__call__ + +.. automethod:: Enum.__contains__ + +.. automethod:: Enum.__eq__ + +.. automethod:: Enum.__getattr__ + +.. automethod:: Enum.__getitem__ + +.. automethod:: Enum.__iter__ + +.. automethod:: Enum.__len__ + +.. automethod:: Enum.__repr__ + + +.. _UnImplementedClassDescr: + +The UnImplemented class +----------------------- +.. autoclass:: UnImplemented + :members: + + +The Unknown class +----------------- +.. autoclass:: Unknown + :members: + + +.. _ExceptionsDescr: + +Exceptions module +----------------- +In the :mod:`exceptions` module exceptions and warnings that are specific +to PyTables are declared. + +.. autoexception:: HDF5ExtError + :members: + +.. autoexception:: ClosedNodeError + +.. autoexception:: ClosedFileError + +.. autoexception:: FileModeError + +.. autoexception:: NodeError + +.. autoexception:: NoSuchNodeError + +.. autoexception:: UndoRedoError + +.. autoexception:: UndoRedoWarning + +.. autoexception:: NaturalNameWarning + +.. autoexception:: PerformanceWarning + +.. autoexception:: FlavorError + +.. autoexception:: FlavorWarning + +.. autoexception:: FiltersWarning + +.. autoexception:: OldIndexWarning + +.. autoexception:: DataTypeWarning + +.. 
autoexception:: ExperimentalFeatureWarning diff --git a/doc/source/usersguide/libref/hierarchy_classes.rst b/doc/source/usersguide/libref/hierarchy_classes.rst new file mode 100644 index 0000000..4d24c91 --- /dev/null +++ b/doc/source/usersguide/libref/hierarchy_classes.rst @@ -0,0 +1,242 @@ +.. currentmodule:: tables + +Hierarchy definition classes +============================ + + +.. _NodeClassDescr: + +The Node class +-------------- +.. autoclass:: Node + +.. These are defined in class docstring + .. autoattribute:: Node._v_depth + .. autoattribute:: Node._v_file + .. autoattribute:: Node._v_name + .. autoattribute:: Node._v_pathname + .. autoattribute:: Node._v_objectid (location independent) + +Node instance variables - location dependent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: Node._v_parent + + +Node instance variables - location independent +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: Node._v_attrs + +.. autoattribute:: Node._v_isopen + + +Node instance variables - attribute shorthands +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: Node._v_title + + +Node methods - hierarchy manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Node._f_close + +.. automethod:: Node._f_copy + +.. automethod:: Node._f_isvisible + +.. automethod:: Node._f_move + +.. automethod:: Node._f_remove + +.. automethod:: Node._f_rename + + +Node methods - attribute handling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Node._f_delattr + +.. automethod:: Node._f_getattr + +.. automethod:: Node._f_setattr + + +.. _GroupClassDescr: + +The Group class +--------------- +.. autoclass:: Group + +.. These are defined in the class docstring + Group instance variables + ~~~~~~~~~~~~~~~~~~~~~~~~ + The following instance variables are provided in addition to those in Node + (see :ref:`NodeClassDescr`): + + .. autoattribute:: Group._v_children + .. autoattribute:: Group._v_groups + .. autoattribute:: Group._v_hidden + .. autoattribute:: Group._v_leaves + .. autoattribute:: Group._v_links + .. autoattribute:: Group._v_unknown + +Group properties +~~~~~~~~~~~~~~~~ +.. autoattribute:: Group._v_nchildren + +.. autoattribute:: Group._v_filters + + +Group methods +~~~~~~~~~~~~~ + +.. important:: + + *Caveat:* The following methods are documented for completeness, and they + can be used without any problem. However, you should use the high-level + counterpart methods in the File class (see :ref:`FileClassDescr`, because + they are most used in documentation and examples, and are a bit more + powerful than those exposed here. + +The following methods are provided in addition to those in +Node (see :ref:`NodeClassDescr`): + + +.. automethod:: Group._f_close + +.. automethod:: Group._f_copy + +.. automethod:: Group._f_copy_children + +.. automethod:: Group._f_get_child + +.. automethod:: Group._f_iter_nodes + +.. automethod:: Group._f_list_nodes + +.. automethod:: Group._f_walk_groups + +.. automethod:: Group._f_walknodes + + +Group special methods +~~~~~~~~~~~~~~~~~~~~~ +Following are described the methods that automatically trigger actions when a +Group instance is accessed in a special way. + +This class defines the :meth:`__setattr__`, :meth:`__getattr__` and +:meth:`__delattr__` methods, and they set, get and delete *ordinary Python +attributes* as normally intended. 
In addition to that, :meth:`__getattr__` +allows getting *child nodes* by their name for the sake of easy interaction +on the command line, as long as there is no Python attribute with the same +name. Groups also allow the interactive completion (when using readline) of +the names of child nodes. For instance:: + + # get a Python attribute + nchild = group._v_nchildren + + # Add a Table child called 'table' under 'group'. + h5file.create_table(group, 'table', my_description) + table = group.table # get the table child instance + group.table = 'foo' # set a Python attribute + + # (PyTables warns you here about using the name of a child node.) + foo = group.table # get a Python attribute + del group.table # delete a Python attribute + table = group.table # get the table child instance again + +.. automethod:: Group.__contains__ + +.. automethod:: Group.__delattr__ + +.. automethod:: Group.__getattr__ + +.. automethod:: Group.__iter__ + +.. automethod:: Group.__repr__ + +.. automethod:: Group.__setattr__ + +.. automethod:: Group.__str__ + + +.. _LeafClassDescr: + +The Leaf class +-------------- +.. autoclass:: Leaf + +.. These are defined in the class docstring + .. _LeafInstanceVariables: + + Leaf instance variables + ~~~~~~~~~~~~~~~~~~~~~~~ + These instance variables are provided in addition to those in Node + (see :ref:`NodeClassDescr`): + + .. autoattribute:: Leaf.byteorder + .. autoattribute:: Leaf.dtype + .. autoattribute:: Leaf.extdim + .. autoattribute:: Leaf.nrows + .. autoattribute:: Leaf.nrowsinbuf + .. autoattribute:: Leaf.shape + + +Leaf properties +~~~~~~~~~~~~~~~ +.. autoattribute:: Leaf.chunkshape + +.. autoattribute:: Leaf.ndim + +.. autoattribute:: Leaf.filters + +.. autoattribute:: Leaf.maindim + +.. autoattribute:: Leaf.flavor + +.. attribute:: Leaf.size_in_memory + + The size of this leaf's data in bytes when it is fully loaded into + memory. + +.. autoattribute:: Leaf.size_on_disk + + +Leaf instance variables - aliases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The following are just easier-to-write aliases to their Node (see +:ref:`NodeClassDescr`) counterparts (indicated between parentheses): + +.. autoattribute:: Leaf.attrs + +.. autoattribute:: Leaf.name + +.. autoattribute:: Leaf.object_id + +.. autoattribute:: Leaf.title + + +Leaf methods +~~~~~~~~~~~~ +.. automethod:: Leaf.close + +.. automethod:: Leaf.copy + +.. automethod:: Leaf.flush + +.. automethod:: Leaf.isvisible + +.. automethod:: Leaf.move + +.. automethod:: Leaf.rename + +.. automethod:: Leaf.remove + +.. automethod:: Leaf.get_attr + +.. automethod:: Leaf.set_attr + +.. automethod:: Leaf.del_attr + +.. automethod:: Leaf.truncate + +.. automethod:: Leaf.__len__ + +.. automethod:: Leaf._f_close diff --git a/doc/source/usersguide/libref/homogenous_storage.rst b/doc/source/usersguide/libref/homogenous_storage.rst new file mode 100644 index 0000000..04d926c --- /dev/null +++ b/doc/source/usersguide/libref/homogenous_storage.rst @@ -0,0 +1,125 @@ +.. currentmodule:: tables + +Homogenous storage classes +========================== + +.. _ArrayClassDescr: + +The Array class +--------------- +.. autoclass:: Array + + +Array instance variables +~~~~~~~~~~~~~~~~~~~~~~~~ +.. attribute:: Array.atom + + An Atom (see :ref:`AtomClassDescr`) instance representing the *type* + and *shape* of the atomic objects to be saved. + +.. autoattribute:: Array.rowsize + +.. attribute:: Array.nrow + + On iterators, this is the index of the current row. + +.. autoattribute:: Array.nrows + + +Array methods +~~~~~~~~~~~~~ +.. 
automethod:: Array.get_enum + +.. automethod:: Array.iterrows + +.. automethod:: Array.__next__ + +.. automethod:: Array.read + + +Array special methods +~~~~~~~~~~~~~~~~~~~~~ +The following methods automatically trigger actions when an :class:`Array` +instance is accessed in a special way (e.g. ``array[2:3,...,::2]`` will be +equivalent to a call to +``array.__getitem__((slice(2, 3, None), Ellipsis, slice(None, None, 2))))``. + +.. automethod:: Array.__getitem__ + +.. automethod:: Array.__iter__ + +.. automethod:: Array.__setitem__ + + +.. _CArrayClassDescr: + +The CArray class +---------------- +.. autoclass:: CArray + + +.. _EArrayClassDescr: + +The EArray class +---------------- +.. autoclass:: EArray + + +.. _EArrayMethodsDescr: + +EArray methods +~~~~~~~~~~~~~~ + +.. automethod:: EArray.append + + +.. _VLArrayClassDescr: + +The VLArray class +----------------- +.. autoclass:: VLArray + +.. These are defined in the class docstring + VLArray instance variables + ~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. autoattribute:: VLArray.atom + .. autoattribute:: VLArray.flavor + .. autoattribute:: VLArray.nrow + .. autoattribute:: VLArray.nrows + .. autoattribute:: VLArray.extdim + .. autoattribute:: VLArray.nrows + + +VLArray properties +~~~~~~~~~~~~~~~~~~ +.. autoattribute:: VLArray.size_on_disk + +.. autoattribute:: VLArray.size_in_memory + + +VLArray methods +~~~~~~~~~~~~~~~ +.. automethod:: VLArray.append + +.. automethod:: VLArray.get_enum + +.. automethod:: VLArray.iterrows + +.. automethod:: VLArray.__next__ + +.. automethod:: VLArray.read + +.. automethod:: VLArray.get_row_size + + +VLArray special methods +~~~~~~~~~~~~~~~~~~~~~~~ +The following methods automatically trigger actions when a :class:`VLArray` +instance is accessed in a special way (e.g., vlarray[2:5] will be equivalent +to a call to vlarray.__getitem__(slice(2, 5, None)). + +.. automethod:: VLArray.__getitem__ + +.. automethod:: VLArray.__iter__ + +.. automethod:: VLArray.__setitem__ diff --git a/doc/source/usersguide/libref/link_classes.rst b/doc/source/usersguide/libref/link_classes.rst new file mode 100644 index 0000000..f5c66fb --- /dev/null +++ b/doc/source/usersguide/libref/link_classes.rst @@ -0,0 +1,73 @@ +.. currentmodule:: tables + +Link classes +============ + +.. _LinkClassDescr: + +The Link class +-------------- +.. autoclass:: tables.link.Link + +.. These are defined in the class docstring + .. autoattribute:: tables.link.Link.target + +Link instance variables +~~~~~~~~~~~~~~~~~~~~~~~ +.. autoattribute:: tables.link.Link._v_attrs + + +Link methods +~~~~~~~~~~~~ +The following methods are useful for copying, moving, renaming and removing +links. + +.. automethod:: tables.link.Link.copy + +.. automethod:: tables.link.Link.move + +.. automethod:: tables.link.Link.remove + +.. automethod:: tables.link.Link.rename + + +.. _SoftLinkClassDescr: + +The SoftLink class +------------------ +.. autoclass:: tables.link.SoftLink + + +SoftLink special methods +~~~~~~~~~~~~~~~~~~~~~~~~ +The following methods are specific for dereferencing and representing soft +links. + +.. automethod:: tables.link.SoftLink.__call__ + +.. automethod:: tables.link.SoftLink.__str__ + + +The ExternalLink class +---------------------- +.. autoclass:: tables.link.ExternalLink + +.. This is defined in the class docstring + ExternalLink instance variables + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + .. autoattribute:: tables.link.ExternalLink.extfile + + +ExternalLink methods +~~~~~~~~~~~~~~~~~~~~ +.. 
automethod:: tables.link.ExternalLink.umount + + +ExternalLink special methods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The following methods are specific for dereferencing and representing +external links. + +.. automethod:: tables.link.ExternalLink.__call__ + +.. automethod:: tables.link.ExternalLink.__str__ diff --git a/doc/source/usersguide/libref/structured_storage.rst b/doc/source/usersguide/libref/structured_storage.rst new file mode 100644 index 0000000..1f76dde --- /dev/null +++ b/doc/source/usersguide/libref/structured_storage.rst @@ -0,0 +1,261 @@ +.. currentmodule:: tables + +Structured storage classes +========================== + +.. _TableClassDescr: + +The Table class +--------------- +.. autoclass:: Table + +.. These are defined in the class docstring + .. _TableInstanceVariablesDescr: + + Table instance variables + ~~~~~~~~~~~~~~~~~~~~~~~~ + The following instance variables are provided in addition to those in Leaf + (see :ref:`LeafClassDescr`). Please note that there are several col* + dictionaries to ease retrieving information about a column directly by its + path name, avoiding the need to walk through Table.description or + :attr:`Table.cols`. + + .. autoattribute:: Table.coldescrs + .. autoattribute:: Table.coldflts + .. autoattribute:: Table.coldtypes + .. autoattribute:: Table.colindexed + .. autoattribute:: Table.colinstances + .. autoattribute:: Table.colnames + .. autoattribute:: Table.colpathnames + .. autoattribute:: Table.cols + .. autoattribute:: Table.coltypes + .. autoattribute:: Table.description + .. autoattribute:: Table.extdim + .. autoattribute:: Table.indexed + .. autoattribute:: Table.nrows + + +Table properties +~~~~~~~~~~~~~~~~ +.. autoattribute:: Table.autoindex + +.. autoattribute:: Table.colindexes + +.. autoattribute:: Table.indexedcolpathnames + +.. autoattribute:: Table.row + +.. autoattribute:: Table.rowsize + + +Table methods - reading +~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Table.col + +.. automethod:: Table.iterrows + +.. automethod:: Table.itersequence + +.. automethod:: Table.itersorted + +.. automethod:: Table.read + +.. automethod:: Table.read_coordinates + +.. automethod:: Table.read_sorted + +.. automethod:: Table.__getitem__ + +.. automethod:: Table.__iter__ + + +Table methods - writing +~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Table.append + +.. automethod:: Table.modify_column + +.. automethod:: Table.modify_columns + +.. automethod:: Table.modify_coordinates + +.. automethod:: Table.modify_rows + +.. automethod:: Table.remove_rows + +.. automethod:: Table.remove_row + +.. automethod:: Table.__setitem__ + + +.. _TableMethods_querying: + +Table methods - querying +~~~~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Table.get_where_list + +.. automethod:: Table.read_where + +.. automethod:: Table.where + +.. automethod:: Table.append_where + +.. automethod:: Table.will_query_use_indexing + + +Table methods - other +~~~~~~~~~~~~~~~~~~~~~ +.. automethod:: Table.copy + +.. automethod:: Table.flush_rows_to_index + +.. automethod:: Table.get_enum + +.. automethod:: Table.reindex + +.. automethod:: Table.reindex_dirty + + +.. _DescriptionClassDescr: + +The Description class +~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: Description + +.. These are defined in the class docstring + Description instance variables + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: Description._v_col_objects + .. autoattribute:: Description._v_dflts + .. autoattribute:: Description._v_dtype + .. autoattribute:: Description._v_dtypes + .. 
autoattribute:: Description._v_is_nested + .. autoattribute:: Description._v_itemsize + .. autoattribute:: Description._v_name + .. autoattribute:: Description._v_names + .. autoattribute:: Description._v_nested_descr + .. autoattribute:: Description._v_nested_formats + .. autoattribute:: Description._v_nestedlvl + .. autoattribute:: Description._v_nested_names + .. autoattribute:: Description._v_pathname + .. autoattribute:: Description._v_pathnames + .. autoattribute:: Description._v_types + .. autoattribute:: Description._v_offsets + + +Description methods +^^^^^^^^^^^^^^^^^^^ +.. automethod:: Description._f_walk + + +.. _RowClassDescr: + +The Row class +~~~~~~~~~~~~~ +.. autoclass:: tables.tableextension.Row + +.. These are defined in the class docstring + Row instance variables + ^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: tables.tableextension.Row.nrow + + +Row methods +^^^^^^^^^^^ +.. automethod:: tables.tableextension.Row.append + +.. automethod:: tables.tableextension.Row.fetch_all_fields + +.. automethod:: tables.tableextension.Row.update + + +.. _RowSpecialMethods: + +Row special methods +^^^^^^^^^^^^^^^^^^^ +.. automethod:: tables.tableextension.Row.__contains__ + +.. automethod:: tables.tableextension.Row.__getitem__ + +.. automethod:: tables.tableextension.Row.__setitem__ + + +.. _ColsClassDescr: + +The Cols class +~~~~~~~~~~~~~~ +.. autoclass:: Cols + +.. These are defined in the class docstring + Cols instance variables + ^^^^^^^^^^^^^^^^^^^^^^^ + .. autoattribute:: Cols._v_colnames + .. autoattribute:: Cols._v_colpathnames + .. autoattribute:: Cols._v_desc + + +Cols properties +^^^^^^^^^^^^^^^ +.. autoattribute:: Cols._v_table + + +Cols methods +^^^^^^^^^^^^ +.. automethod:: Cols._f_col + +.. automethod:: Cols.__getitem__ + +.. automethod:: Cols.__len__ + +.. automethod:: Cols.__setitem__ + + +.. _ColumnClassDescr: + +The Column class +~~~~~~~~~~~~~~~~ +.. autoclass:: Column + +.. These are defined in the class docstring + + .. autoattribute:: Column.descr + .. autoattribute:: Column.name + .. autoattribute:: Column.pathname + +Column instance variables +^^^^^^^^^^^^^^^^^^^^^^^^^ +.. autoattribute:: Column.dtype + +.. autoattribute:: Column.index + +.. autoattribute:: Column.is_indexed + +.. autoattribute:: Column.maindim + +.. autoattribute:: Column.shape + +.. autoattribute:: Column.table + +.. autoattribute:: Column.type + + +Column methods +^^^^^^^^^^^^^^ +.. automethod:: Column.create_index + +.. automethod:: Column.create_csindex + +.. automethod:: Column.reindex + +.. automethod:: Column.reindex_dirty + +.. automethod:: Column.remove_index + + +Column special methods +^^^^^^^^^^^^^^^^^^^^^^ +.. automethod:: Column.__getitem__ + +.. automethod:: Column.__len__ + +.. automethod:: Column.__setitem__ diff --git a/doc/source/usersguide/libref/top_level.rst b/doc/source/usersguide/libref/top_level.rst new file mode 100644 index 0000000..6d7fe4a --- /dev/null +++ b/doc/source/usersguide/libref/top_level.rst @@ -0,0 +1,36 @@ +.. currentmodule:: tables + +Top-level variables and functions +================================= + +Global variables +---------------- + +.. autodata:: __version__ + +.. autodata:: hdf5_version + + +Global functions +---------------- +.. autofunction:: copy_file + +.. autofunction:: is_hdf5_file + +.. autofunction:: is_pytables_file + +.. autofunction:: open_file + +.. autofunction:: set_blosc_max_threads + +.. autofunction:: print_versions + +.. autofunction:: restrict_flavors + +.. autofunction:: split_type + +.. autofunction:: test + +.. 
autofunction:: which_lib_version

.. autofunction:: silence_hdf5_messages
diff --git a/doc/source/usersguide/optimization.rst b/doc/source/usersguide/optimization.rst
new file mode 100644
index 0000000..5b6335f
--- /dev/null
+++ b/doc/source/usersguide/optimization.rst
@@ -0,0 +1,1138 @@
Optimization tips
=================
.. epigraph::

   ... durch planmässiges Tattonieren.

   [... through systematic, palpable experimentation.]

   -- Johann Karl Friedrich Gauss [asked how he came upon his theorems]

.. currentmodule:: tables

In this chapter, you will gain deeper knowledge of PyTables internals.
PyTables has many tunable features so that you can improve the performance of
your application.  If you are planning to deal with really large data, you
should read this chapter carefully in order to learn how to get an important
efficiency boost for your code.  But if your datasets are small (say, up to
10 MB) or your number of nodes is moderate (up to 1000), you should not worry
about that, as the default parameters in PyTables are already tuned for those
sizes (although you may want to adjust them further anyway).  At any rate,
reading this chapter will help you in your life with PyTables.


Understanding chunking
----------------------
The underlying HDF5 library used by PyTables allows certain datasets (the
so-called *chunked* datasets) to take the data in bunches of a certain
length, named *chunks*, and write them on disk as a whole, i.e. the HDF5
library treats chunks as atomic objects and disk I/O is always made in terms
of complete chunks.  This allows data filters to be defined by the
application to perform tasks such as compression, encryption, check-summing,
etc. on entire chunks.

HDF5 keeps a B-tree in memory that is used to map chunk structures on disk.
The more chunks that are allocated for a dataset, the larger the B-tree.
Large B-trees take memory and cause file storage overhead as well as more
disk I/O and higher contention for the metadata cache.  Consequently, it is
important to balance between memory and I/O overhead (small B-trees) and time
to access data (big B-trees).

In the next couple of sections, you will discover how to inform PyTables
about the expected size of your datasets so that it can compute sensible
chunk sizes.  Also, you will be presented with some experiments so that you
can get a feeling for the consequences of manually specifying the chunk size.
Although manually tuning the chunksize is reserved for experienced users,
these benchmarks may allow you to understand the chunk size implications more
deeply and let you quickly start fine-tuning this important parameter.


.. _expectedRowsOptim:

Informing PyTables about expected number of rows in tables or arrays
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
PyTables can determine a sensible chunk size for your dataset if you help it
by providing an estimate of the final number of rows for an extensible
leaf [1]_.  You should provide this information at leaf creation time by
passing this value to the ``expectedrows`` argument of the
:meth:`File.create_table` method or :meth:`File.create_earray` method (see
:ref:`EArrayClassDescr`).

When your leaf size is bigger than 10 MB (take this figure only as a rough
reference), providing this guess will optimize the access to your data.
When the table or array size is larger than, say, 100 MB, you are *strongly*
encouraged to provide such a guess; failing to do so may cause your
application to do very slow I/O operations and to demand *huge* amounts of
memory.  You have been warned!
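
As a quick illustration, a leaf creation with such a hint could look like the
following sketch (the ``Particle`` description, the file name and the
ten-million-row estimate are merely illustrative)::

    import tables

    class Particle(tables.IsDescription):
        name = tables.StringCol(16)
        energy = tables.Float64Col()

    with tables.open_file("hint.h5", mode="w") as h5file:
        # The expectedrows hint lets PyTables choose a sensible chunk size.
        table = h5file.create_table("/", "particles", Particle,
                                    expectedrows=10_000_000)
        # The same hint also works for extensible arrays.
        earray = h5file.create_earray("/", "values",
                                      atom=tables.Float64Atom(), shape=(0,),
                                      expectedrows=10_000_000)
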
.. _chunksizeFineTune:

Fine-tuning the chunksize
~~~~~~~~~~~~~~~~~~~~~~~~~
.. warning::

   This section is mostly meant for experts.  If you are a beginner, you
   should know that manually setting the chunksize is a potentially
   dangerous action.

Most of the time, informing PyTables about the extent of your dataset is
enough.  However, for more sophisticated applications, when one has special
I/O requirements or when dealing with really large datasets, you should
really understand the implications of the chunk size in order to be able to
find the best value for your own application.

You can specify the chunksize for every chunked dataset in PyTables by
passing the chunkshape argument to the corresponding constructors.  It is
important to point out that chunkshape is not exactly the same thing as a
chunksize; in fact, the chunksize of a dataset can be computed by multiplying
all the dimensions of the chunkshape together and multiplying the outcome by
the size of the atom.

We are going to describe a series of experiments where an EArray of 15 GB is
written with different chunksizes, and then it is accessed in both sequential
mode (i.e. first element 0, then element 1 and so on and so forth until the
data is exhausted) and random mode (i.e. single elements are read randomly
all through the dataset).  These benchmarks have been carried out with
PyTables 2.1 on a machine with an Intel Core2 processor @ 3 GHz and a RAID-0
made of two SATA disks spinning at 7200 RPM, and using GNU/Linux with an XFS
filesystem.  The script used for the benchmarks is available in
bench/optimal-chunksize.py.

In figures :ref:`Figure 1 <createTime-chunksize>`,
:ref:`Figure 2 <fileSizes-chunksize>`, :ref:`Figure 3 <seqTime-chunksize>`
and :ref:`Figure 4 <randomTime-chunksize>`, you can see how the chunksize
affects different aspects, like creation time, file sizes, sequential read
time and random read time.  So, if you properly inform PyTables about the
extent of your datasets, you will get an automatic chunksize value (256 KB in
this case) that is pretty optimal for most uses.  However, if what you want
is, for example, to optimize the creation time when using the Zlib
compressor, you may want to reduce the chunksize to 32 KB (see
:ref:`Figure 1 <createTime-chunksize>`).  Or, if your goal is to optimize the
sequential access time for a dataset compressed with Blosc, you may want to
increase the chunksize to 512 KB (see :ref:`Figure 3 <seqTime-chunksize>`).

You will notice that, by manually specifying the chunksize of a leaf, you
will not normally get a drastic increase in performance, but at least you
have the opportunity to fine-tune such an important parameter to improve
performance.

.. _createTime-chunksize:

.. figure:: images/create-chunksize-15GB.png
   :align: center

   **Figure 1. Creation time per element for a 15 GB EArray and different
   chunksizes.**


.. _fileSizes-chunksize:

.. figure:: images/filesizes-chunksize-15GB.png
   :align: center

   **Figure 2. File sizes for a 15 GB EArray and different chunksizes.**


.. _seqTime-chunksize:

.. figure:: images/seq-chunksize-15GB.png
   :align: center

   **Figure 3. Sequential access time per element for a 15 GB EArray and
   different chunksizes.**


.. _randomTime-chunksize:

.. figure:: images/random-chunksize-15GB.png
   :align: center

   **Figure 4. Random access time per element for a 15 GB EArray and
   different chunksizes.**


Finally, it is worth noting that adjusting the chunksize can be especially
important if you want to access your dataset by blocks of certain
dimensions.  In this case, it is normally a good idea to set your chunkshape
to be the same as these dimensions; you only have to be careful not to end up
with a chunksize that is too small or too large.  As always, experimenting
before putting your application into production is your best ally.
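
For instance, if you know beforehand that a two-dimensional array will mostly
be read in blocks of 128x128 elements, a sketch like the following (the file
name, node name and array size are only illustrative) sets the chunkshape to
match that access pattern::

    import tables

    with tables.open_file("blocks.h5", mode="w") as h5file:
        # 128 * 128 float64 elements per chunk -> 128 KB chunks, which is
        # within the sensible range discussed above.
        carray = h5file.create_carray("/", "image",
                                      atom=tables.Float64Atom(),
                                      shape=(16384, 16384),
                                      chunkshape=(128, 128))
        print(carray.chunkshape)  # (128, 128)
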
.. _searchOptim:

Accelerating your searches
--------------------------

.. note::

   Many of the explanations and plots in this section and the forthcoming
   ones still need to be updated to include Blosc (see
   :ref:`[BLOSC] `), the new and powerful compressor added in the
   PyTables 2.2 series.  You should expect it to be the fastest compressor
   among all of those described here, and its use is strongly recommended
   whenever you need extreme speed and not a very high compression ratio.

Searching in tables is one of the most common and time-consuming operations
that a typical user faces in the process of mining through their data.  Being
able to perform queries as fast as possible will give you more opportunities
to find the desired information quicker and also allows you to deal with
larger datasets.

PyTables offers several kinds of techniques to speed up the search process as
much as possible, and a series of benchmarks has been designed and carried
out in order to give you hints on when to use each of them.  All the results
presented in this section have been obtained with synthetic, random data and
using PyTables 2.1.  Also, the tests have been conducted on a machine with an
Intel Core2 (64-bit) @ 3 GHz processor with RAID-0 disk storage (made of four
spinning disks @ 7200 RPM), using GNU/Linux with an XFS filesystem.  The
script used for the benchmarks is available in bench/indexed_search.py.
As your data, queries and platform may be totally different in your case,
take this just as a guide because your mileage may vary (and will vary).

In order to be able to play with tables with a number of rows as large as
possible, the record size has been chosen to be rather small (24 bytes).
Here is its definition::

    class Record(tables.IsDescription):
        col1 = tables.Int32Col()
        col2 = tables.Int32Col()
        col3 = tables.Float64Col()
        col4 = tables.Float64Col()

In the next sections, we will be optimizing the times for a relatively
complex query like this::

    result = [row['col2'] for row in table if (
        ((row['col4'] >= lim1 and row['col4'] < lim2) or
         (row['col2'] > lim3 and row['col2'] < lim4)) and
        ((row['col1'] + 3.1*row['col2'] + row['col3']*row['col4']) > lim5)
        )]

(for future reference, we will call this sort of query a *regular* query).
So, if you want to see how to greatly improve the time taken to run queries
like this, keep reading.


.. _inkernelSearch:

In-kernel searches
~~~~~~~~~~~~~~~~~~
PyTables provides a way to accelerate data selections inside a single table,
through the use of the :ref:`TableMethods_querying` iterator and related
query methods.  This mode of selecting data is called *in-kernel*.
Let's see an example of an *in-kernel* query based on the *regular* one
mentioned above::

    result = [row['col2'] for row in table.where(
        '''(((col4 >= lim1) & (col4 < lim2)) |
            ((col2 > lim3) & (col2 < lim4))) &
           ((col1 + 3.1*col2 + col3*col4) > lim5)''')]
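
The same condition can also be evaluated in a single shot with
:meth:`Table.read_where`, which returns the matching rows packed in a NumPy
structured array.  The following minimal sketch assumes that ``table`` is an
already opened Table using the Record description above and that the limit
values are arbitrary; it also shows how the query variables can be passed
explicitly through the condvars argument instead of being picked up from the
local namespace::

    condvars = {'lim1': 0.1, 'lim2': 0.9, 'lim3': 5, 'lim4': 10, 'lim5': 12.0}
    condition = '''(((col4 >= lim1) & (col4 < lim2)) |
                    ((col2 > lim3) & (col2 < lim4))) &
                   ((col1 + 3.1*col2 + col3*col4) > lim5)'''

    # Rows fulfilling the condition, returned as a structured array.
    rows = table.read_where(condition, condvars=condvars)
    result = rows['col2']
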
This simple change of mode selection can improve search times quite a lot and
actually makes PyTables very competitive when compared against typical
relational databases, as you can see in
:ref:`Figure 5 <sequentialTimes-10m>` and :ref:`Figure 6 <sequentialTimes-1g>`.

.. _sequentialTimes-10m:

.. figure:: images/Q7-10m-noidx.png
   :align: center

   **Figure 5. Times for non-indexed complex queries in a small table with
   10 million rows: the data fits in memory.**

By looking at :ref:`Figure 5 <sequentialTimes-10m>` you can see that, when
the table data fits easily in memory, in-kernel searches on uncompressed
tables are generally much faster (10x) than standard queries and also faster
(5x) than PostgreSQL.  Regarding compression, we can see how the Zlib
compressor actually slows down the performance of in-kernel queries by a
factor of 3.5x; however, they remain faster than PostgreSQL (by 40%).
For its part, the LZO compressor only decreases the performance by 75% with
respect to uncompressed in-kernel queries and is still a lot faster (3x) than
PostgreSQL.  Finally, one can observe that, for low selectivity queries
(large number of hits), PostgreSQL performance degrades quite steadily, while
in PyTables this slow-down rate is significantly smaller.  The reason for
this behaviour is not entirely clear to the authors, but the fact is clearly
reproducible in our benchmarks.

But why are in-kernel queries so fast when compared with regular ones?
The answer is that in regular selection mode the data for all the rows in the
table has to be brought into Python space so as to evaluate the condition and
decide if the corresponding field should be added to the result list.  On the
contrary, in the in-kernel mode, the condition is passed to the PyTables
kernel (hence the name), written in C, and evaluated there at full C speed
(with the help of the integrated Numexpr package, see
:ref:`[NUMEXPR] `), so that the only values that are brought to
Python space are the rows that fulfilled the condition.  Hence, for
selections that only have a relatively small number of hits (compared with
the total amount of rows), the savings are very large.  It is also
interesting to note that, although for queries with a large number of hits
the speed-up is not as high, it is still very important.

On the other hand, when the table is too large to fit in memory (see
:ref:`Figure 6 <sequentialTimes-1g>`), the difference in speed between
regular and in-kernel queries is not so important, but still significant
(2x).  Also, and curiously enough, large tables compressed with Zlib offer
slightly better performance (around 20%) than uncompressed ones; this is
because the additional CPU time spent by the decompressor is compensated by
the savings in terms of net I/O (one has to read less actual data from
disk).  However, the extremely fast LZO compressor gives a clear advantage
over Zlib, and is up to 2.5x faster than not using compression at all.  The
reason is that LZO decompression speed is much faster than Zlib's, and that
allows PyTables to read the data at full disk speed (i.e. the bottleneck is
in the I/O subsystem, not in the CPU).  In this case the compression rate is
around 2.5x, and this is why the data can be read 2.5x faster.
So, in general, +using the LZO compressor is the best way to ensure best reading/querying +performance for out-of-core datasets (more about how compression affects +performance in :ref:`compressionIssues`). + +.. _sequentialTimes-1g: + +.. figure:: images/Q8-1g-noidx.png + :align: center + + **Figure 6. Times for non-indexed complex queries in a large table with 1 + billion of rows: the data does not fit in memory.** + +Furthermore, you can mix the *in-kernel* and *regular* selection modes for +evaluating arbitrarily complex conditions making use of external functions. +Look at this example:: + + result = [ row['var2'] + for row in table.where('(var3 == "foo") & (var1 <= 20)') + if your_function(row['var2']) ] + +Here, we use an *in-kernel* selection to choose rows according to the values +of the var3 and var1 fields. Then, we apply a *regular* selection to +complete the query. Of course, when you mix the *in-kernel* and *regular* +selection modes you should pass the most restrictive condition to the +*in-kernel* part, i.e. to the where() iterator. In situations where it is +not clear which is the most restrictive condition, you might want to +experiment a bit in order to find the best combination. + +However, since in-kernel condition strings allow rich expressions allowing +the coexistence of multiple columns, variables, arithmetic operations and +many typical functions, it is unlikely that you will be forced to use +external regular selections in conditions of small to medium complexity. +See :ref:`condition_syntax` for more information on in-kernel condition +syntax. + + +Indexed searches +~~~~~~~~~~~~~~~~ +When you need more speed than *in-kernel* selections can offer you, PyTables +offers a third selection method, the so-called *indexed* mode (based on the +highly efficient OPSI indexing engine ). In this mode, you have to decide +which column(s) you are going to apply your selections over, and index them. +Indexing is just a kind of sorting operation over a column, so that searches +along such a column (or columns) will look at this sorted information by +using a *binary search* which is much faster than the *sequential search* +described in the previous section. + +You can index the columns you want by calling the :meth:`Column.create_index` +method on an already created table. For example:: + + indexrows = table.cols.var1.create_index() + indexrows = table.cols.var2.create_index() + indexrows = table.cols.var3.create_index() + +will create indexes for all var1, var2 and var3 columns. + +After you have indexed a series of columns, the PyTables query optimizer will +try hard to discover the usable indexes in a potentially complex expression. +However, there are still places where it cannot determine that an index can +be used. See below for examples where the optimizer can safely determine if +an index, or series of indexes, can be used or not. 
+ +Example conditions where an index can be used: + +- var1 >= "foo" (var1 is used) + +- var1 >= mystr (var1 is used) + +- (var1 >= "foo") & (var4 > 0.0) (var1 is used) + +- ("bar" <= var1) & (var1 < "foo") (var1 is used) + +- (("bar" <= var1) & (var1 < "foo")) & (var4 > 0.0) (var1 is used) + +- (var1 >= "foo") & (var3 > 10) (var1 and var3 are used) + +- (var1 >= "foo") | (var3 > 10) (var1 and var3 are used) + +- ~(var1 >= "foo") | ~(var3 > 10) (var1 and var3 are used) + +Example conditions where an index can *not* be used: + +- var4 > 0.0 (var4 is not indexed) + +- var1 != 0.0 (range has two pieces) + +- ~(("bar" <= var1) & (var1 < "foo")) & (var4 > 0.0) (negation of a complex boolean expression) + +.. note:: From PyTables 2.3 on, several indexes can be used in a single query. + +.. note:: + + If you want to know for sure whether a particular query will use indexing + or not (without actually running it), you are advised to use the + :meth:`Table.will_query_use_indexing` method. + +One important aspect of the new indexing in PyTables (>= 2.3) is that it has +been designed from the ground up with the goal of being capable to +effectively manage very large tables. To this goal, it sports a wide +spectrum of different quality levels (also called optimization levels) for +its indexes so that the user can choose the best one that suits her needs +(more or less size, more or less performance). + +In :ref:`Figure 7 `, you can see that the times to index +columns in tables can be really short. In particular, the time to index a +column with 1 billion rows (1 Gigarow) with the lowest optimization level is +less than 4 minutes while indexing the same column with full optimization (so +as to get a completely sorted index or CSI) requires around 1 hour. These +are rather competitive figures compared with a relational database (in this +case, PostgreSQL 8.3.1, which takes around 1.5 hours for getting the index +done). This is because PyTables is geared towards read-only or append-only +tables and takes advantage of this fact to optimize the indexes properly. On +the contrary, most relational databases have to deliver decent performance in +other scenarios as well (specially updates and deletions), and this fact +leads not only to slower index creation times, but also to indexes taking +much more space on disk, as you can see in :ref:`Figure 8 `. + +.. _createIndexTimes: + +.. figure:: images/create-index-time-int32-float64.png + :align: center + + **Figure 7. Times for indexing an Int32 and Float64 column.** + + +.. _indexSizes: + +.. figure:: images/indexes-sizes2.png + :align: center + + **Figure 8. Sizes for an index of a Float64 column with 1 billion of rows.** + + +The user can select the index quality by passing the desired optlevel and +kind arguments to the :meth:`Column.create_index` method. We can see in +figures :ref:`Figure 7 ` and :ref:`Figure 8 ` +how the different optimization levels affects index time creation and index +sizes. + +So, which is the effect of the different optimization levels in terms of +query times? You can see that in :ref:`Figure 9 `. + +.. _queryTimes-indexed-optlevels: + +.. figure:: images/Q8-1g-idx-optlevels.png + :align: center + + **Figure 9. Times for complex queries with a cold cache (mean of 5 first + random queries) for different optimization levels. 
Benchmark made on a machine with Intel Core2 (64-bit) @ 3 GHz processor with RAID-0 disk storage.** + +Of course, compression also has an effect when doing indexed queries, +although not very noticeable, as can be seen in +:ref:`Figure 10 `. +As you can see, the difference between using no compression and using Zlib or +LZO is very little, although LZO achieves relatively better performance +generally speaking. + +.. _queryTimes-indexed-compress: + +.. figure:: images/Q8-1g-idx-compress.png + :align: center + + **Figure 10. Times for complex queries with a cold cache (mean of 5 first + random queries) for different compressors.** + +You can find a more complete description and benchmarks about OPSI, the +indexing system of PyTables (>= 2.3) in :ref:`[OPSI] `. + + +Indexing and Solid State Disks (SSD) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Lately, the long promised Solid State Disks (SSD for brevity) with decent +capacities and affordable prices have finally hit the market and will +probably stay in coexistence with the traditional spinning disks for the +foreseeable future (separately or forming *hybrid* systems). SSD have many +advantages over spinning disks, like much less power consumption and better +throughput. But of paramount importance, specially in the context of +accelerating indexed queries, is its very reduced latency during disk seeks, +which is typically 100x better than traditional disks. +Such a huge improvement has to have a clear impact in reducing the query +times, specially when the selectivity is high (i.e. the number of hits is +small). + +In order to offer an estimate on the performance improvement we can expect +when using a low-latency SSD instead of traditional spinning disks, the +benchmark in the previous section has been repeated, but this time using a +single SSD disk instead of the four spinning disks in RAID-0. The result can +be seen in :ref:`Figure 11 `. There one can see how +a query in a table of 1 billion of rows with 100 hits took just 1 tenth of +second when using a SSD, instead of 1 second that needed the RAID made of +spinning disks. This factor of 10x of speed-up for high-selectivity queries +is nothing to sneeze at, and should be kept in mind when really high +performance in queries is needed. It is also interesting that using +compression with LZO does have a clear advantage over when no compression is +done. + +.. _queryTimes-indexed-SSD: + +.. figure:: images/Q8-1g-idx-SSD.png + :align: center + + **Figure 11. Times for complex queries with a cold cache (mean of 5 first + random queries) for different disk storage (SSD vs spinning disks).** + +Finally, we should remark that SSD can't compete with traditional spinning +disks in terms of capacity as they can only provide, for a similar cost, +between 1/10th and 1/50th of the size of traditional disks. It is here where +the compression capabilities of PyTables can be very helpful because both +tables and indexes can be compressed and the final space can be reduced by +typically 2x to 5x (4x to 10x when compared with traditional relational +databases). +Best of all, as already mentioned, performance is not degraded when +compression is used, but actually *improved*. 
So, by using PyTables and SSDs you can query larger datasets that would
otherwise require spinning disks when using other databases (in fact, we
were unable to run the PostgreSQL benchmark in this case because the space
needed exceeded the capacity of our SSD), while allowing improvements in the
speed of indexed queries of between 2x (for medium to low selectivity
queries) and 10x (for high selectivity queries).


Achieving ultimate speed: sorted tables and beyond
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. warning::

   Sorting a large table is a costly operation.  The next procedure should
   only be performed when your dataset is mainly read-only and meant to be
   queried many times.

When querying large tables, most of the query time is spent in locating the
interesting rows to be read from disk.  On some occasions, you may have
queries whose result depends *mainly* on one single column (a query with only
one single condition is the trivial example), so we can guess that sorting
the table by this column would allow the interesting rows to be located in a
much more efficient way (because they would be mostly *contiguous*).  We are
going to confirm this guess.

For the case of the query that we have been using in the previous sections::

    result = [row['col2'] for row in table.where(
        '''(((col4 >= lim1) & (col4 < lim2)) |
            ((col2 > lim3) & (col2 < lim4))) &
           ((col1 + 3.1*col2 + col3*col4) > lim5)''')]

it is possible to determine, by analysing the data distribution and the query
limits, that col4 is such a *main column*.  So, by ordering the table by the
col4 column (for example, by setting the column to sort by via the sortby
parameter of the :meth:`Table.copy` method and re-indexing col2 and col4
afterwards), we should get much faster performance for our query.  This is
effectively demonstrated in :ref:`Figure 12 <queryTimes-indexed-sorted>`,
where one can see how queries with a low to medium (up to 10000) number of
hits can be done in around one tenth of a second for a RAID-0 setup and in
around one hundredth of a second for an SSD.  This represents an improvement
of more than 100x in speed with respect to the times with unsorted tables.
On the other hand, when the number of hits is large (> 1 million), the query
times grow almost linearly, showing a near-perfect scalability for both
RAID-0 and SSD setups (the sequential access to disk becomes the bottleneck
in this case).

.. _queryTimes-indexed-sorted:

.. figure:: images/Q8-1g-idx-sorted.png
   :align: center

   **Figure 12. Times for complex queries with a cold cache (mean of 5 first
   random queries) for unsorted and sorted tables.**
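
A minimal sketch of the sorting procedure described above could look like
this (the node names are only illustrative; it assumes that ``table`` uses
the Record description from the previous sections and hangs from the root
group)::

    # A completely sorted index (CSI) on col4 is required so that the copy
    # can be sorted by that column.
    table.cols.col4.create_csindex()

    # Make a copy of the table, sorted by col4.
    sorted_table = table.copy(newparent='/', newname='table_sorted',
                              sortby='col4', checkCSI=True)

    # Re-create the indexes used by the query on the sorted copy.
    sorted_table.cols.col2.create_index()
    sorted_table.cols.col4.create_index()
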
Even though we have shown many ways to improve query times that should
fulfill the needs of most people, those needing more can surely discover new
optimization opportunities.  For example, querying against sorted tables is
limited mainly by sequential access to data on disk and by data compression
capability, so you may want to read :ref:`chunksizeFineTune` for ways of
improving this aspect.  Reading the other sections of this chapter will also
help you find new ways to increase performance.  You know, the limit for
stopping the optimization process is basically your imagination (but, most
plausibly, your available time ;-).


.. _compressionIssues:

Compression issues
------------------
One of the beauties of PyTables is that it supports compression on tables and
arrays [2]_, although it is not used by default.  Compressing big amounts of
data might be a somewhat controversial feature, because compression has a
reputation of being a very big consumer of CPU time.  However, if you are
willing to check whether compression can help not only by reducing your
dataset file size but *also* by improving I/O efficiency, especially when
dealing with very large datasets, keep reading.


A study on supported compression libraries
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The compression library used by default is *Zlib* (see :ref:`[ZLIB] `).
Since HDF5 *requires* it, you can safely use it and expect that your HDF5
files will be readable on any other platform that has HDF5 libraries
installed.  Zlib provides good compression ratios, although it is somewhat
slow at compressing, and reasonably fast at decompressing.  Because of that,
it is a good candidate to be used for compressing your data.

However, in some situations it is critical to have a *very good decompression
speed* (at the expense of lower compression ratios or more CPU wasted on
compression, as we will see soon).  In others, the emphasis is put on
achieving the *maximum compression ratios*, no matter what reading speed
results.  This is why support for two additional compressors has been added
to PyTables: LZO (see :ref:`[LZO] `) and bzip2 (see :ref:`[BZIP2] `).
According to the author of LZO (and as checked by the author of this section,
as you will see soon), LZO offers pretty fast compression and extremely fast
decompression.  In fact, LZO is so fast when compressing/decompressing that
it may well happen (depending on your data, of course) that writing or
reading a compressed dataset is sometimes faster than if it is not compressed
at all (especially when dealing with extremely large datasets).  This fact is
very important, especially if you have to deal with very large amounts of
data.  Regarding bzip2, it has a reputation of achieving excellent
compression ratios, but at the price of spending much more CPU time, which
results in very low compression/decompression speeds.

Be aware that the LZO and bzip2 support in PyTables is not standard in HDF5,
so if you are going to use your PyTables files in contexts other than
PyTables you will not be able to read them.  Still, see :ref:`ptrepackDescr`
(where the ptrepack utility is described) to find a way to free your files
from LZO or bzip2 dependencies, so that you can use these compressors locally
with the guarantee that you can replace them with Zlib (or even remove
compression completely) if you want to use these files with other HDF5 tools
or platforms afterwards.

In order to allow you to grasp what amount of compression can be achieved,
and how this affects performance, a series of experiments has been carried
out.  All the results presented in this section (and in the next one) have
been obtained with synthetic data and using PyTables 1.3.  Also, the tests
have been conducted on an IBM OpenPower 720 (e-series) with a PowerPC G5 at
1.65 GHz and a hard disk spinning at 15K RPM.  As your data and platform may
be totally different in your case, take this just as a guide because your
mileage may vary.  Finally, to be able to play with tables with as many rows
as possible, the record size has been chosen to be small (16 bytes).  Here is
its definition::

    class Bench(IsDescription):
        var1 = StringCol(itemsize=4)
        var2 = IntCol()
        var3 = FloatCol()

With this setup, you can look at the compression ratios that can be achieved
in :ref:`Figure 13 <comprTblComparison>`.
As you can see, LZO is the +compressor that performs worse in this sense, but, curiously enough, there is +not much difference between Zlib and bzip2. + +.. _comprTblComparison: + +.. figure:: images/compressed-recordsize.png + :align: center + + **Figure 13. Comparison between different compression libraries.** + +Also, PyTables lets you select different compression levels for Zlib and +bzip2, although you may get a bit disappointed by the small improvement that +these compressors show when dealing with a combination of numbers and strings +as in our example. As a reference, see plot +:ref:`Figure 14 ` for a comparison of the compression +achieved by selecting different levels of Zlib. Very oddly, the best +compression ratio corresponds to level 1 (!). See later for an explanation +and more figures on this subject. + +.. _comprZlibComparison: + +.. figure:: images/compressed-recordsize-zlib.png + :align: center + + **Figure 14. Comparison between different compression levels of Zlib.** + +Have also a look at :ref:`Figure 15 `. It shows how the +speed of writing rows evolves as the size (number of rows) of the table +grows. Even though in these graphs the size of one single row is 16 bytes, +you can most probably extrapolate these figures to other row sizes. + +.. _comprWriteComparison: + +.. figure:: images/compressed-writing.png + :align: center + + **Figure 15. Writing tables with several compressors.** + +In :ref:`Figure 16 ` you can see how compression +affects the reading performance. In fact, what you see in the plot is an +*in-kernel selection* speed, but provided that this operation is very fast +(see :ref:`inkernelSearch`), we can accept it as an actual read test. +Compared with the reference line without compression, the general trend here +is that LZO does not affect too much the reading performance (and in some +points it is actually better), Zlib makes speed drop to a half, while bzip2 +is performing very slow (up to 8x slower). + +Also, in the same :ref:`Figure 16 ` you can +notice some strange peaks in the speed that we might be tempted to attribute +to libraries on which PyTables relies (HDF5, compressors...), or to PyTables +itself. +However, :ref:`Figure 17 ` reveals that, if we put +the file in the filesystem cache (by reading it several times before, for +example), the evolution of the performance is much smoother. So, the most +probable explanation would be that such peaks are a consequence of the +underlying OS filesystem, rather than a flaw in PyTables (or any other +library behind it). Another consequence that can be derived from the +aforementioned plot is that LZO decompression performance is much better than +Zlib, allowing an improvement in overall speed of more than 2x, and perhaps +more important, the read performance for really large datasets (i.e. when +they do not fit in the OS filesystem cache) can be actually *better* than not +using compression at all. Finally, one can see that reading performance is +very badly affected when bzip2 is used (it is 10x slower than LZO and 4x than +Zlib), but this was somewhat expected anyway. + +.. _comprReadNoCacheComparison: + +.. figure:: images/compressed-select-nocache.png + :align: center + + **Figure 16. Selecting values in tables with several compressors. + The file is not in the OS cache.** + + +.. _comprReadCacheComparison: + +.. figure:: images/compressed-select-cache.png + :align: center + + **Figure 17. Selecting values in tables with several compressors. 
   The file is in the OS cache.**

So, generally speaking, and looking at the experiments above, you can expect
LZO to be the fastest at both compressing and decompressing, but also the
one that achieves the worst compression ratio (although that may be just
fine for many situations, especially when used with shuffling - see
:ref:`ShufflingOptim`). bzip2 is by far the slowest at both compressing and
decompressing and, besides, it does not achieve any better compression ratio
than Zlib. Zlib represents a balance between the two: it is somewhat slower
than LZO at compressing (about 2x) and decompressing (about 3x), but it
normally achieves better compression ratios.

Finally, by looking at the plots in
:ref:`Figure 18 <comprWriteZlibComparison>`,
:ref:`Figure 19 <comprReadZlibComparison>`, and the aforementioned
:ref:`Figure 14 <comprZlibComparison>`, you can see why the recommended
compression level for all compression libraries is 1. This is the lowest
level of compression, and as the size of the underlying HDF5 chunk is
normally rather small compared with the size of the compression buffers,
there is not much point in increasing the compression level. Nonetheless, in
some situations (for example, in extremely large tables or arrays, where the
computed chunk size can be rather large) you may want to check, on your own,
how the different compression levels actually affect your application.

You can select the compression library and level by setting the complib and
complevel keywords of the Filters class (see :ref:`FiltersClassDescr`). A
compression level of 0 completely disables compression (the default), 1 is
the least memory- and CPU-demanding level, while 9 is the maximum level and
the most memory- and CPU-intensive one. Finally, keep in mind that LZO does
not accept a compression level right now, so, when using LZO, 0 means that
compression is not active, and any other value means that LZO is active.

So, in conclusion: if your ultimate goal is writing and reading as fast as
possible, choose LZO. If you want to reduce your data as much as possible,
while retaining acceptable read speed, choose Zlib. Finally, if portability
is important to you, Zlib is your best bet. So, when would you want to use
bzip2? Well, looking at the results, it is difficult to recommend its use in
general, but you may want to experiment with it in those cases where you
know that it is well suited to your data pattern (for example, when dealing
with repetitive string datasets).

.. _comprWriteZlibComparison:

.. figure:: images/compressed-writing-zlib.png
   :align: center

   **Figure 18. Writing in tables with different levels of compression.**

.. _comprReadZlibComparison:

.. figure:: images/compressed-select-cache-zlib.png
   :align: center

   **Figure 19. Selecting values in tables with different levels of
   compression. The file is in the OS cache.**


.. _ShufflingOptim:

Shuffling (or how to make the compression process more effective)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The HDF5 library provides an interesting filter that can leverage the
results of your favorite compressor. Its name is *shuffle*, and because it
can greatly benefit compression while not taking many CPU resources (see
below for a justification), it is active *by default* in PyTables whenever
compression is activated (independently of the chosen compressor). It is
deactivated when compression is off (which is the default, as you should
already know).
Of course, you can deactivate it if you want, but this is not +recommended. + +.. note:: + + Since PyTables 3.3, a new *bitshuffle* filter for Blosc compressor + has been added. Contrarily to *shuffle* that shuffles bytes, + *bitshuffle* shuffles the chunk data at bit level which **could** + improve compression ratios at the expense of some speed penalty. + Look at the :ref:`FiltersClassDescr` documentation on how to + activate bitshuffle and experiment with it so as to decide if it + can be useful for you. + + +So, how does this mysterious filter exactly work? From the HDF5 reference +manual:: + + "The shuffle filter de-interlaces a block of data by reordering the + bytes. All the bytes from one consistent byte position of each data + element are placed together in one block; all bytes from a second + consistent byte position of each data element are placed together a + second block; etc. For example, given three data elements of a 4-byte + datatype stored as 012301230123, shuffling will re-order data as + 000111222333. This can be a valuable step in an effective compression + algorithm because the bytes in each byte position are often closely + related to each other and putting them together can increase the + compression ratio." + +In :ref:`Figure 20 ` you can see a benchmark that +shows how the *shuffle* filter can help the different libraries in +compressing data. In this experiment, shuffle has made LZO compress almost 3x +more (!), while Zlib and bzip2 are seeing improvements of 2x. Once again, the +data for this experiment is synthetic, and *shuffle* seems to do a great work +with it, but in general, the results will vary in each case [3]_. + +.. _comprShuffleComparison: + +.. figure:: images/compressed-recordsize-shuffle.png + :align: center + + **Figure 20. Comparison between different compression libraries with and + without the shuffle filter.** + +At any rate, the most remarkable fact about the *shuffle* filter is the +relatively high level of compression that compressor filters can achieve when +used in combination with it. A curious thing to note is that the Bzip2 +compression rate does not seem very much improved (less than a 40%), and what +is more striking, Bzip2+shuffle does compress quite *less* than Zlib+shuffle +or LZO+shuffle combinations, which is kind of unexpected. The thing that +seems clear is that Bzip2 is not very good at compressing patterns that +result of shuffle application. As always, you may want to experiment with +your own data before widely applying the Bzip2+shuffle combination in order +to avoid surprises. + +Now, how does shuffling affect performance? Well, if you look at plots +:ref:`Figure 21 `, +:ref:`Figure 22 ` and +:ref:`Figure 23 `, you will get a somewhat +unexpected (but pleasant) surprise. Roughly, *shuffle* makes the writing +process (shuffling+compressing) faster (approximately a 15% for LZO, 30% for +Bzip2 and a 80% for Zlib), which is an interesting result by itself. +But perhaps more exciting is the fact that the reading process +(unshuffling+decompressing) is also accelerated by a similar extent (a 20% +for LZO, 60% for Zlib and a 75% for Bzip2, roughly). + +.. _comprWriteShuffleComparison: + +.. figure:: images/compressed-writing-shuffle.png + :align: center + + **Figure 21. Writing with different compression libraries with and + without the shuffle filter.** + + +.. _comprReadNoCacheShuffleComparison: + +.. figure:: images/compressed-select-nocache-shuffle-only.png + :align: center + + **Figure 22. 
Reading with different compression libraries with the + shuffle filter. The file is not in OS cache.** + + + +.. _comprReadCacheShuffleComparison: + +.. figure:: images/compressed-select-cache-shuffle.png + :align: center + + **Figure 23. Reading with different compression libraries with and + without the shuffle filter. The file is in OS cache.** + +You may wonder why introducing another filter in the write/read pipelines +does effectively accelerate the throughput. Well, maybe data elements are +more similar or related column-wise than row-wise, i.e. contiguous elements +in the same column are more alike, so shuffling makes the job of the +compressor easier (faster) and more effective (greater ratios). As a side +effect, compressed chunks do fit better in the CPU cache (at least, the +chunks are smaller!) so that the process of unshuffle/decompress can make a +better use of the cache (i.e. reducing the number of CPU cache faults). + +So, given the potential gains (faster writing and reading, but specially +much improved compression level), it is a good thing to have such a filter +enabled by default in the battle for discovering redundancy when you want to +compress your data, just as PyTables does. + + +Using Psyco +----------- +Psyco (see :ref:`[PSYCO] `) is a kind of specialized compiler for +Python that typically accelerates Python applications with no change in +source code. You can think of Psyco as a kind of just-in-time (JIT) compiler, +a little bit like Java's, that emits machine code on the fly instead of +interpreting your Python program step by step. The result is that your +unmodified Python programs run faster. + +Psyco is very easy to install and use, so in most scenarios it is worth to +give it a try. However, it only runs on Intel 386 architectures, so if you +are using other architectures, you are out of luck (and, moreover, it seems +that there are no plans to support other platforms). Besides, with the +addition of flexible (and very fast) in-kernel queries (by the way, they +cannot be optimized at all by Psyco), the use of Psyco will only help in +rather few scenarios. In fact, the only important situation that you might +benefit right now from using Psyco (I mean, in PyTables contexts) is for +speeding-up the write speed in tables when using the Row interface (see +:ref:`RowClassDescr`). But again, this latter case can also be accelerated +by using the :meth:`Table.append` method and building your own buffers [4]_. + +As an example, imagine that you have a small script that reads and selects +data over a series of datasets, like this:: + + def read_file(filename): + "Select data from all the tables in filename" + fileh = open_file(filename, mode = "r") + result = [] + for table in fileh("/", 'Table'): + result = [p['var3'] for p in table if p['var2'] <= 20] + fileh.close() + return result + + if __name__=="__main__": + print(read_file("myfile.h5")) + +In order to accelerate this piece of code, you can rewrite your main program +to look like:: + + if __name__=="__main__": + import psyco + psyco.bind(read_file) + print(read_file("myfile.h5")) + +That's all! From now on, each time that you execute your Python script, +Psyco will deploy its sophisticated algorithms so as to accelerate your +calculations. + +You can see in the graphs :ref:`Figure 24 ` and +:ref:`Figure 25 ` how much I/O speed improvement you can +get by using Psyco. By looking at this figures you can get an idea if these +improvements are of your interest or not. 
In general, if you are not going to +use compression you will take advantage of Psyco if your tables are medium +sized (from a thousand to a million rows), and this advantage will disappear +progressively when the number of rows grows well over one million. However if +you use compression, you will probably see improvements even beyond this +limit (see :ref:`compressionIssues`). +As always, there is no substitute for experimentation with your own dataset. + +.. _psycoWriteComparison: + +.. figure:: images/write-medium-psyco-nopsyco-comparison.png + :align: center + + **Figure 24. Writing tables with/without Psyco.** + + +.. _psycoReadComparison: + +.. figure:: images/read-medium-psyco-nopsyco-comparison.png + :align: center + + **Figure 25. Reading tables with/without Psyco.** + + +.. _LRUOptim: + +Getting the most from the node LRU cache +---------------------------------------- +One limitation of the initial versions of PyTables was that they needed to +load all nodes in a file completely before being ready to deal with them, +making the opening times for files with a lot of nodes very high and +unacceptable in many cases. + +Starting from PyTables 1.2 on, a new lazy node loading schema was setup that +avoids loading all the nodes of the *object tree* in memory. In addition, a +new LRU cache was introduced in order to accelerate the access to already +visited nodes. This cache (one per file) is responsible for keeping up the +most recently visited nodes in memory and discard the least recent used ones. +This represents a big advantage over the old schema, not only in terms of +memory usage (as there is no need to load *every* node in memory), but it +also adds very convenient optimizations for working interactively like, for +example, speeding-up the opening times of files with lots of nodes, allowing +to open almost any kind of file in typically less than one tenth of second +(compare this with the more than 10 seconds for files with more than 10000 +nodes in PyTables pre-1.2 era) as well as optimizing the access to frequently +visited nodes. See for more info on the advantages (and also drawbacks) of +this approach. + +One thing that deserves some discussion is the election of the parameter that +sets the maximum amount of nodes to be kept in memory at any time. +As PyTables is meant to be deployed in machines that can have potentially low +memory, the default for it is quite conservative (you can look at its actual +value in the :data:`parameters.NODE_CACHE_SLOTS` parameter in module +:file:`tables/parameters.py`). However, if you usually need to deal with +files that have many more nodes than the maximum default, and you have a lot +of free memory in your system, then you may want to experiment in order to +see which is the appropriate value of :data:`parameters.NODE_CACHE_SLOTS` that +fits better your needs. + +As an example, look at the next code:: + + def browse_tables(filename): + fileh = open_file(filename,'a') + group = fileh.root.newgroup + for j in range(10): + for tt in fileh.walk_nodes(group, "Table"): + title = tt.attrs.TITLE + for row in tt: + pass + fileh.close() + +We will be running the code above against a couple of files having a +``/newgroup`` containing 100 tables and 1000 tables respectively. In addition, +this benchmark is run twice for two different values of the LRU cache size, +specifically 256 and 1024. You can see the results in +:ref:`table `. + +.. _optimization_table_1: + +.. only:: not latex + + .. 
table:: **Retrieval speed and memory consumption depending on the number of nodes in LRU cache.** + + ====================== =========== === ======= ==== ==== === ======= ==== ==== + Number: 100 nodes 1000 nodes + ---------------------------------- --------------------- --------------------- + Mem & Speed Memory (MB) Time (ms) Memory (MB) Time (ms) + ---------------------------------- ----------- --------- ----------- --------- + Node is coming from... Cache size 256 1024 256 1024 256 1024 256 1024 + ====================== =========== === ======= ==== ==== === ======= ==== ==== + Disk 14 14 1.24 1.24 51 66 1.33 1.31 + Cache 14 14 0.53 0.52 65 73 1.35 0.68 + ====================== =========== === ======= ==== ==== === ======= ==== ==== + +.. raw:: latex + + \begin{threeparttable} + \capstart\caption{Retrieval speed and memory consumption depending on the number of nodes in LRU cache.} + + \begin{tabulary}{\linewidth}{|l|l|r|r|r|r|r|r|r|r|} + \hline + \multicolumn{2}{|l|}{\textbf{Number:}} & \multicolumn{4}{|c|}{\textbf{100 nodes}} & \multicolumn{4}{|c|}{\textbf{1000 nodes}} \\ + \hline + \multicolumn{2}{|l|}{\textbf{Mem and Speed}} & \multicolumn{2}{|c|}{\textbf{Memory (MB)}} & \multicolumn{2}{|c|}{\textbf{Time (ms)}} & \multicolumn{2}{|c|}{\textbf{Memory (MB)}} & \multicolumn{2}{|c|}{\textbf{Time (ms)}}\\ + \hline + \textbf{Node is coming from...} & \textbf{Cache size} & \textbf{256} & \textbf{1024} & \textbf{256} & \textbf{1024} & \textbf{256} & \textbf{1024} & \textbf{256} & \textbf{1024}\\ + \hline + Disk & & 14 & 14 & 1.24 & 1.24 & 51 & 66 & 1.33 & 1.31 \\ + Cache & & 14 & 14 & 0.53 & 0.52 & 65 & 73 & 1.35 & 0.68 \\ + \hline + \end{tabulary} + + \end{threeparttable} + + +From the data in :ref:`table `, one can see that when +the number of objects that you are dealing with does fit in cache, you will +get better access times to them. Also, incrementing the node cache size +effectively consumes more memory *only* if the total nodes exceeds the slots +in cache; otherwise the memory consumption remains the same. It is also worth +noting that incrementing the node cache size in the case you want to fit all +your nodes in cache does not take much more memory than being too +conservative. On the other hand, it might happen that the speed-up that you +can achieve by allocating more slots in your cache is not worth the amount of +memory used. + +Also worth noting is that if you have a lot of memory available and +performance is absolutely critical, you may want to try out a negative value +for :data:`parameters.NODE_CACHE_SLOTS`. This will cause that all the touched +nodes will be kept in an internal dictionary and this is the faster way to +load/retrieve nodes. +However, and in order to avoid a large memory consumption, the user will be +warned when the number of loaded nodes will reach the ``-NODE_CACHE_SLOTS`` +value. + +Finally, a value of zero in :data:`parameters.NODE_CACHE_SLOTS` means that +any cache mechanism is disabled. + +At any rate, if you feel that this issue is important for you, there is no +replacement for setting your own experiments up in order to proceed to +fine-tune the :data:`parameters.NODE_CACHE_SLOTS` parameter. + +.. note:: + + PyTables >= 2.3 sports an optimized LRU cache node written in C, so + you should expect significantly faster LRU cache operations when + working with it. + + +.. note:: + + Numerical results reported in :ref:`table ` have been + obtained with PyTables < 3.1. 
In PyTables 3.1 the node cache mechanism has + been completely redesigned so while all comments above are still valid, + numerical values could be a little bit different from the ones reported in + :ref:`table `. + + +Compacting your PyTables files +------------------------------ +Let's suppose that you have a file where you have made a lot of row deletions +on one or more tables, or deleted many leaves or even entire subtrees. These +operations might leave *holes* (i.e. space that is not used anymore) in your +files that may potentially affect not only the size of the files but, more +importantly, the performance of I/O. This is because when you delete a lot of +rows in a table, the space is not automatically recovered on the fly. +In addition, if you add many more rows to a table than specified in the +expectedrows keyword at creation time this may affect performance as well, as +explained in :ref:`expectedRowsOptim`. + +In order to cope with these issues, you should be aware that PyTables +includes a handy utility called ptrepack which can be very useful not only to +compact *fragmented* files, but also to adjust some internal parameters in +order to use better buffer and chunk sizes for optimum I/O speed. +Please check the :ref:`ptrepackDescr` for a brief tutorial on its use. + +Another thing that you might want to use ptrepack for is changing the +compression filters or compression levels on your existing data for different +goals, like checking how this can affect both final size and I/O performance, +or getting rid of the optional compressors like LZO or bzip2 in your existing +files, in case you want to use them with generic HDF5 tools that do not have +support for these filters. + +-------------- + +.. [1] CArray nodes, though not + extensible, are chunked and have their optimum chunk size + automatically computed at creation time, since their final shape is known. + +.. [2] Except for Array objects. + +.. [3] Some users reported that the typical improvement with real + data is between a factor 1.5x and 2.5x over the already compressed + datasets. + +.. [4] So, there is not much point in using Psyco + with recent versions of PyTables anymore. + diff --git a/doc/source/usersguide/parameter_files.rst b/doc/source/usersguide/parameter_files.rst new file mode 100644 index 0000000..bd4945e --- /dev/null +++ b/doc/source/usersguide/parameter_files.rst @@ -0,0 +1,159 @@ +.. _parameter_files: + +PyTables parameter files +======================== + +.. currentmodule:: tables.parameters + +PyTables issues warnings when certain limits are exceeded. Those limits are +not intrinsic limitations of the underlying software, but rather are +proactive measures to avoid large resource consumptions. The default limits +should be enough for most of cases, and users should try to respect them. +However, in some situations, it can be convenient to increase (or decrease) +these limits. + +Also, and in order to get maximum performance, PyTables implements a series +of sophisticated features, like I/O buffers or different kind of caches (for +nodes, chunks and other internal metadata). These features comes with a +default set of parameters that ensures a decent performance in most of +situations. But, as there is always a need for every case, it is handy to +have the possibility to fine-tune some of these parameters. + +Because of these reasons, PyTables implements a couple of ways to change the +values of these parameters. All the *tunable* parameters live in the +:file:`tables/parameters.py`. 
The user can choose to change them in the +parameter files themselves for a global and persistent change. Moreover, if +he wants a finer control, he can pass any of these parameters directly to the +:func:`tables.open_file` function, and the new parameters will only take +effect in the corresponding file (the defaults will continue to be in the +parameter files). + +A description of all of the tunable parameters follows. As the defaults +stated here may change from release to release, please check with your actual +parameter files so as to know your actual default values. + +.. warning:: + + Changing the next parameters may have a very bad effect in the resource + consumption and performance of your PyTables scripts. + + Please be careful when touching these! + + +Tunable parameters in parameters.py +----------------------------------- + +Recommended maximum values +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autodata:: MAX_COLUMNS + +.. autodata:: MAX_NODE_ATTRS + +.. autodata:: MAX_GROUP_WIDTH + +.. autodata:: MAX_TREE_DEPTH + +.. autodata:: MAX_UNDO_PATH_LENGTH + + +Cache limits +~~~~~~~~~~~~ +.. autodata:: CHUNK_CACHE_NELMTS + +.. autodata:: CHUNK_CACHE_PREEMPT + +.. autodata:: CHUNK_CACHE_SIZE + +.. autodata:: COND_CACHE_SLOTS + +.. autodata:: METADATA_CACHE_SIZE + +.. autodata:: NODE_CACHE_SLOTS + + +Parameters for the different internal caches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autodata:: BOUNDS_MAX_SIZE + +.. autodata:: BOUNDS_MAX_SLOTS + +.. autodata:: ITERSEQ_MAX_ELEMENTS + +.. autodata:: ITERSEQ_MAX_SIZE + +.. autodata:: ITERSEQ_MAX_SLOTS + +.. autodata:: LIMBOUNDS_MAX_SIZE + +.. autodata:: LIMBOUNDS_MAX_SLOTS + +.. autodata:: TABLE_MAX_SIZE + +.. autodata:: SORTED_MAX_SIZE + +.. autodata:: SORTEDLR_MAX_SIZE + +.. autodata:: SORTEDLR_MAX_SLOTS + + +Parameters for general cache behaviour +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + The next parameters will not take any effect if passed to the open_file() + function, so they can only be changed in a *global* way. You can change + them in the file, but this is strongly discouraged unless you know well + what you are doing. + +.. autodata:: DISABLE_EVERY_CYCLES + +.. autodata:: ENABLE_EVERY_CYCLES + +.. autodata:: LOWEST_HIT_RATIO + + +Parameters for the I/O buffer in Leaf objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autodata:: IO_BUFFER_SIZE + +.. autodata:: BUFFER_TIMES + + +Miscellaneous +~~~~~~~~~~~~~ + +.. autodata:: EXPECTED_ROWS_EARRAY + +.. autodata:: EXPECTED_ROWS_TABLE + +.. autodata:: PYTABLES_SYS_ATTRS + +.. autodata:: MAX_NUMEXPR_THREADS + +.. autodata:: MAX_BLOSC_THREADS + +.. autodata:: USER_BLOCK_SIZE + +.. autodata:: ALLOW_PADDING + + +HDF5 driver management +~~~~~~~~~~~~~~~~~~~~~~ + +.. autodata:: DRIVER + +.. autodata:: DRIVER_DIRECT_ALIGNMENT + +.. autodata:: DRIVER_DIRECT_BLOCK_SIZE + +.. autodata:: DRIVER_DIRECT_CBUF_SIZE + +.. autodata:: DRIVER_CORE_INCREMENT + +.. autodata:: DRIVER_CORE_BACKING_STORE + +.. autodata:: DRIVER_CORE_IMAGE + +.. autodata:: DRIVER_SPLIT_META_EXT + +.. autodata:: DRIVER_SPLIT_RAW_EXT diff --git a/doc/source/usersguide/tutorials.rst b/doc/source/usersguide/tutorials.rst new file mode 100644 index 0000000..84c3052 --- /dev/null +++ b/doc/source/usersguide/tutorials.rst @@ -0,0 +1,2328 @@ +Tutorials +========= +.. epigraph:: + + Seràs la clau que obre tots els panys, + seràs la llum, la llum il.limitada, + seràs confí on l'aurora comença, + seràs forment, escala il.luminada! + + -- Lyrics: Vicent Andrés i Estellés. 
Music: Ovidi Montllor, Toti Soler, M'aclame a tu + + +This chapter consists of a series of simple yet comprehensive +tutorials that will enable you to understand PyTables' main features. If +you would like more information about some particular instance variable, +global function, or method, look at the doc strings or go to the library +reference in :ref:`library_reference`. If you are reading this in PDF or HTML +formats, follow the corresponding hyperlink near each newly introduced +entity. + +Please note that throughout this document the terms *column* and *field* +will be used interchangeably, as will the terms *row* and *record*. + +.. currentmodule:: tables + +Getting started +--------------- +In this section, we will see how to define our own records in Python and save +collections of them (i.e. a *table*) into a file. Then we will select some of +the data in the table using Python cuts and create NumPy arrays to store this +selection as separate objects in a tree. + +In *examples/tutorial1-1.py* you will find the working version of all the +code in this section. Nonetheless, this tutorial series has been written to +allow you reproduce it in a Python interactive console. I encourage you to do +parallel testing and inspect the created objects (variables, docs, children +objects, etc.) during the course of the tutorial! + + +Importing tables objects +~~~~~~~~~~~~~~~~~~~~~~~~ +Before starting you need to import the public objects in the tables package. +You normally do that by executing:: + + >>> import tables + +This is the recommended way to import tables if you don't want to pollute +your namespace. However, PyTables has a contained set of first-level +primitives, so you may consider using the alternative:: + + >>> from tables import * + +If you are going to work with NumPy arrays (and normally, you will) you will +also need to import functions from the numpy package. So most PyTables +programs begin with:: + + >>> import tables # but in this tutorial we use "from tables import \*" + >>> import numpy as np + + +Declaring a Column Descriptor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now, imagine that we have a particle detector and we want to create a table +object in order to save data retrieved from it. You need first to define the +table, the number of columns it has, what kind of object is contained in each +column, and so on. + +Our particle detector has a TDC (Time to Digital Converter) counter with a +dynamic range of 8 bits and an ADC (Analogical to Digital Converter) with a +range of 16 bits. For these values, we will define 2 fields in our record +object called TDCcount and ADCcount. We also want to save the grid position +in which the particle has been detected, so we will add two new fields called +grid_i and grid_j. Our instrumentation also can obtain the pressure and +energy of the particle. The resolution of the pressure-gauge allows us to use +a single-precision float to store pressure readings, while the energy value +will need a double-precision float. Finally, to track the particle we want to +assign it a name to identify the kind of the particle it is and a unique +numeric identifier. So we will add two more fields: name will be a string of +up to 16 characters, and idnumber will be an integer of 64 bits (to allow us +to store records for extremely large numbers of particles). 
+ +Having determined our columns and their types, we can now declare a new +Particle class that will contain all this information:: + + >>> from tables import * + >>> class Particle(IsDescription): + ... name = StringCol(16) # 16-character String + ... idnumber = Int64Col() # Signed 64-bit integer + ... ADCcount = UInt16Col() # Unsigned short integer + ... TDCcount = UInt8Col() # unsigned byte + ... grid_i = Int32Col() # 32-bit integer + ... grid_j = Int32Col() # 32-bit integer + ... pressure = Float32Col() # float (single-precision) + ... energy = Float64Col() # double (double-precision) + >>> + +This definition class is self-explanatory. Basically, you declare a class +variable for each field you need. As its value you assign an instance of the +appropriate Col subclass, according to the kind of column defined (the data +type, the length, the shape, etc). See the :ref:`ColClassDescr` for a +complete description of these subclasses. See also :ref:`datatypes` for a +list of data types supported by the Col constructor. + +From now on, we can use Particle instances as a descriptor for our detector +data table. We will see later on how to pass this object to construct the +table. But first, we must create a file where all the actual data pushed into +our table will be saved. + + +Creating a PyTables file from scratch +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the top-level :func:`open_file` function to create a PyTables file:: + + >>> h5file = open_file("tutorial1.h5", mode="w", title="Test file") + +:func:`open_file` is one of the objects imported by the +```from tables import *``` statement. Here, we are saying that we want to +create a new file in the current working directory called "tutorial1.h5" in +"w"rite mode and with a descriptive title string ("Test file"). +This function attempts to open the file, and if successful, returns the File +(see :ref:`FileClassDescr`) object instance h5file. The root of the object +tree is specified in the instance's root attribute. + + +Creating a new group +~~~~~~~~~~~~~~~~~~~~ +Now, to better organize our data, we will create a group called *detector* +that branches from the root node. We will save our particle data table in +this group:: + + >>> group = h5file.create_group("/", 'detector', 'Detector information') + +Here, we have taken the File instance h5file and invoked its +:meth:`File.create_group` method to create a new group called *detector* +branching from "*/*" (another way to refer to the h5file.root object we +mentioned above). This will create a new Group (see :ref:`GroupClassDescr`) +object instance that will be assigned to the variable group. + + +Creating a new table +~~~~~~~~~~~~~~~~~~~~ +Let's now create a Table (see :ref:`TableClassDescr`) object as a branch off +the newly-created group. We do that by calling the :meth:`File.create_table` +method of the h5file object:: + + >>> table = h5file.create_table(group, 'readout', Particle, "Readout example") + +We create the Table instance under group. We assign this table the node name +"*readout*". The Particle class declared before is the *description* +parameter (to define the columns of the table) and finally we set +"*Readout example*" as the Table title. With all this information, a new +Table instance is created and assigned to the variable *table*. 
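As an aside, :meth:`File.create_table` also accepts additional keyword
arguments, so the same description could be stored with compression enabled
by passing a Filters instance (see :ref:`FiltersClassDescr`). The sketch
below is only illustrative and is not executed in this tutorial; in
particular, the ``readout_zlib`` node name and ``zlib_table`` variable are
made up for the example::

    # Sketch only -- not part of the tutorial file we are building:
    # the same columns, but compressed with zlib at level 1.
    from tables import Filters
    zlib_table = h5file.create_table(group, 'readout_zlib', Particle,
                                     "Readout example (zlib)",
                                     filters=Filters(complevel=1, complib='zlib'))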
+ +If you are curious about how the object tree looks right now, simply print +the File instance variable *h5file*, and examine the output:: + + >>> print(h5file) + tutorial1.h5 (File) 'Test file' + Last modif.: 'Wed Mar 7 11:06:12 2007' + Object Tree: + / (RootGroup) 'Test file' + /detector (Group) 'Detector information' + /detector/readout (Table(0,)) 'Readout example' + +As you can see, a dump of the object tree is displayed. It's easy to see the +Group and Table objects we have just created. If you want more information, +just type the variable containing the File instance:: + + >>> h5file + File(filename='tutorial1.h5', title='Test file', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False)) + / (RootGroup) 'Test file' + /detector (Group) 'Detector information' + /detector/readout (Table(0,)) 'Readout example' + description := { + "ADCcount": UInt16Col(shape=(), dflt=0, pos=0), + "TDCcount": UInt8Col(shape=(), dflt=0, pos=1), + "energy": Float64Col(shape=(), dflt=0.0, pos=2), + "grid_i": Int32Col(shape=(), dflt=0, pos=3), + "grid_j": Int32Col(shape=(), dflt=0, pos=4), + "idnumber": Int64Col(shape=(), dflt=0, pos=5), + "name": StringCol(itemsize=16, shape=(), dflt='', pos=6), + "pressure": Float32Col(shape=(), dflt=0.0, pos=7)} + byteorder := 'little' + chunkshape := (87,) + +More detailed information is displayed about each object in the tree. Note +how Particle, our table descriptor class, is printed as part of the *readout* +table description information. In general, you can obtain much more +information about the objects and their children by just printing them. That +introspection capability is very useful, and I recommend that you use it +extensively. + +The time has come to fill this table with some values. First we will get a +pointer to the Row (see :ref:`RowClassDescr`) instance of this table +instance:: + + >>> particle = table.row + +The row attribute of table points to the Row instance that will be used to +write data rows into the table. We write data simply by assigning the Row +instance the values for each row as if it were a dictionary (although it is +actually an *extension class*), using the column names as keys. + +Below is an example of how to write rows:: + + >>> for i in range(10): + ... particle['name'] = f'Particle: {i:6d}' + ... particle['TDCcount'] = i % 256 + ... particle['ADCcount'] = (i * 256) % (1 << 16) + ... particle['grid_i'] = i + ... particle['grid_j'] = 10 - i + ... particle['pressure'] = float(i*i) + ... particle['energy'] = float(particle['pressure'] ** 4) + ... particle['idnumber'] = i * (2 ** 34) + ... # Insert a new particle record + ... particle.append() + >>> + +This code should be easy to understand. The lines inside the loop just assign +values to the different columns in the Row instance particle (see +:ref:`RowClassDescr`). A call to its append() method writes this information +to the table I/O buffer. + +After we have processed all our data, we should flush the table's I/O buffer +if we want to write all this data to disk. We achieve that by calling the +table.flush() method:: + + >>> table.flush() + +Remember, flushing a table is a *very important* step as it will not only +help to maintain the integrity of your file, but also will free valuable +memory resources (i.e. internal buffers) that your program may need for other +things. + + +.. _readingAndSelectingUsage: + +Reading (and selecting) data in a table +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Ok. 
We have our data on disk, and now we need to access it and select from +specific columns the values we are interested in. See the example below:: + + >>> table = h5file.root.detector.readout + >>> pressure = [x['pressure'] for x in table.iterrows() if x['TDCcount'] > 3 and 20 <= x['pressure'] < 50] + >>> pressure + [25.0, 36.0, 49.0] + +The first line creates a "shortcut" to the *readout* table deeper on the +object tree. As you can see, we use the *natural naming* schema to access it. +We also could have used the h5file.get_node() method, as we will do later on. + +You will recognize the last two lines as a Python list comprehension. +It loops over the rows in *table* as they are provided by the +:meth:`Table.iterrows` iterator. The iterator returns values until all the +data in table is exhausted. These rows are filtered using the expression:: + + x['TDCcount'] > 3 and 20 <= x['pressure'] < 50 + +So, we are selecting the values of the pressure column from filtered records +to create the final list and assign it to pressure variable. + +We could have used a normal for loop to accomplish the same purpose, but I +find comprehension syntax to be more compact and elegant. + +PyTables do offer other, more powerful ways of performing selections which +may be more suitable if you have very large tables or if you need very high +query speeds. They are called *in-kernel* and *indexed* queries, and you can +use them through :meth:`Table.where` and other related methods. + +Let's use an in-kernel selection to query the name column for the same set of +cuts:: + + >>> names = [ x['name'] for x in table.where("""(TDCcount > 3) & (20 <= pressure) & (pressure < 50)""") ] + >>> names + ['Particle: 5', 'Particle: 6', 'Particle: 7'] + +In-kernel and indexed queries are not only much faster, but as you can see, +they also look more compact, and are among the greatest features for +PyTables, so be sure that you use them a lot. See :ref:`condition_syntax` and +:ref:`searchOptim` for more information on in-kernel and indexed selections. + +.. note:: + + A special care should be taken when the query condition includes + string literals. Indeed Python 2 string literals are string of + bytes while Python 3 strings are unicode objects. + + With reference to the above definition of :class:`Particle` it has to be + noted that the type of the "name" column do not change depending on the + Python version used (of course). + It always corresponds to strings of bytes. + + Any condition involving the "name" column should be written using the + appropriate type for string literals in order to avoid + :exc:`TypeError`\ s. + + Suppose one wants to get rows corresponding to specific particle names. + + The code below will work fine in Python 2 but will fail with a + :exc:`TypeError` in Python 3:: + + >>> condition = '(name == "Particle: 5") | (name == "Particle: 7")' + >>> for record in table.where(condition): # TypeError in Python3 + ... # do something with "record" + + The reason is that in Python 3 "condition" implies a comparison + between a string of bytes ("name" column contents) and an unicode + literals. + + The correct way to write the condition is:: + + >>> condition = '(name == b"Particle: 5") | (name == b"Particle: 7")' + +That's enough about selections for now. The next section will show you how to +save these selected results to a file. 
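One last selection-related aside: if you prefer to retrieve the whole
selection in a single step as a NumPy record array, rather than building a
Python list, you can use :meth:`Table.read_where`. A minimal sketch with the
same cuts as above (the ``rows`` variable name is just illustrative)::

    >>> rows = table.read_where('(TDCcount > 3) & (20 <= pressure) & (pressure < 50)')
    >>> len(rows)  # the same three records as in the previous selections
    3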
+ + +Creating new array objects +~~~~~~~~~~~~~~~~~~~~~~~~~~ +In order to separate the selected data from the mass of detector data, we +will create a new group columns branching off the root group. Afterwards, +under this group, we will create two arrays that will contain the selected +data. First, we create the group:: + + >>> gcolumns = h5file.create_group(h5file.root, "columns", "Pressure and Name") + +Note that this time we have specified the first parameter using *natural +naming* (h5file.root) instead of with an absolute path string ("/"). + +Now, create the first of the two Array objects we've just mentioned:: + + >>> h5file.create_array(gcolumns, 'pressure', np.array(pressure), "Pressure column selection") + /columns/pressure (Array(3,)) 'Pressure column selection' + atom := Float64Atom(shape=(), dflt=0.0) + maindim := 0 + flavor := 'numpy' + byteorder := 'little' + chunkshape := None + +We already know the first two parameters of the :meth:`File.create_array` +methods (these are the same as the first two in create_table): they are the +parent group *where* Array will be created and the Array instance *name*. +The third parameter is the *object* we want to save to disk. In this case, it +is a NumPy array that is built from the selection list we created before. +The fourth parameter is the *title*. + +Now, we will save the second array. It contains the list of strings we +selected before: we save this object as-is, with no further conversion:: + + >>> h5file.create_array(gcolumns, 'name', names, "Name column selection") + /columns/name (Array(3,)) 'Name column selection' + atom := StringAtom(itemsize=16, shape=(), dflt='') + maindim := 0 + flavor := 'python' + byteorder := 'irrelevant' + chunkshape := None + +As you can see, :meth:`File.create_array` accepts *names* (which is a regular +Python list) as an *object* parameter. Actually, it accepts a variety of +different regular objects (see :func:`create_array`) as parameters. The flavor +attribute (see the output above) saves the original kind of object that was +saved. Based on this *flavor*, PyTables will be able to retrieve exactly the +same object from disk later on. + +Note that in these examples, the create_array method returns an Array instance +that is not assigned to any variable. Don't worry, this is intentional to +show the kind of object we have created by displaying its representation. The +Array objects have been attached to the object tree and saved to disk, as you +can see if you print the complete object tree:: + + >>> print(h5file) + tutorial1.h5 (File) 'Test file' + Last modif.: 'Wed Mar 7 19:40:44 2007' + Object Tree: + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + /detector (Group) 'Detector information' + /detector/readout (Table(10,)) 'Readout example' + + +Closing the file and looking at its content +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To finish this first tutorial, we use the close method of the h5file File +object to close the file before exiting Python:: + + >>> h5file.close() + >>> ^D + $ + +You have now created your first PyTables file with a table and two arrays. +You can examine it with any generic HDF5 tool, such as h5dump or h5ls. Here is +what the tutorial1.h5 looks like when read with the h5ls program. + +.. 
code-block:: bash + + $ h5ls -rd tutorial1.h5 + /columns Group + /columns/name Dataset {3} + Data: + (0) "Particle: 5", "Particle: 6", "Particle: 7" + /columns/pressure Dataset {3} + Data: + (0) 25, 36, 49 + /detector Group + /detector/readout Dataset {10/Inf} + Data: + (0) {0, 0, 0, 0, 10, 0, "Particle: 0", 0}, + (1) {256, 1, 1, 1, 9, 17179869184, "Particle: 1", 1}, + (2) {512, 2, 256, 2, 8, 34359738368, "Particle: 2", 4}, + (3) {768, 3, 6561, 3, 7, 51539607552, "Particle: 3", 9}, + (4) {1024, 4, 65536, 4, 6, 68719476736, "Particle: 4", 16}, + (5) {1280, 5, 390625, 5, 5, 85899345920, "Particle: 5", 25}, + (6) {1536, 6, 1679616, 6, 4, 103079215104, "Particle: 6", 36}, + (7) {1792, 7, 5764801, 7, 3, 120259084288, "Particle: 7", 49}, + (8) {2048, 8, 16777216, 8, 2, 137438953472, "Particle: 8", 64}, + (9) {2304, 9, 43046721, 9, 1, 154618822656, "Particle: 9", 81} + +Here's the output as displayed by the "ptdump" PyTables utility (located in +utils/ directory). + +.. code-block:: bash + + $ ptdump tutorial1.h5 + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + /detector (Group) 'Detector information' + /detector/readout (Table(10,)) 'Readout example' + +You can pass the `-v` or `-d` options to ptdump if you want +more verbosity. Try them out! + +Also, in :ref:`Figure 1 `, you can admire how the +tutorial1.h5 looks like using the `ViTables `_ graphical +interface. + +.. _tutorial1-1-tableview: + +.. figure:: images/tutorial1-1-tableview.png + :align: center + + **Figure 1. The initial version of the data file for tutorial 1, with a + view of the data objects.** + + +Browsing the *object tree* +-------------------------- +In this section, we will learn how to browse the tree and retrieve data and +also meta-information about the actual data. + +In *examples/tutorial1-2.py* you will find the working version of all the +code in this section. As before, you are encouraged to use a python shell and +inspect the object tree during the course of the tutorial. + + +Traversing the object tree +~~~~~~~~~~~~~~~~~~~~~~~~~~ +Let's start by opening the file we created in last tutorial section:: + + >>> h5file = open_file("tutorial1.h5", "a") + +This time, we have opened the file in "a"ppend mode. We use this mode to add +more information to the file. + +PyTables, following the Python tradition, offers powerful introspection +capabilities, i.e. you can easily ask information about any component of the +object tree as well as search the tree. + +To start with, you can get a preliminary overview of the object tree by +simply printing the existing File instance:: + + >>> print(h5file) + tutorial1.h5 (File) 'Test file' + Last modif.: 'Wed Mar 7 19:50:57 2007' + Object Tree: + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + /detector (Group) 'Detector information' + /detector/readout (Table(10,)) 'Readout example' + +It looks like all of our objects are there. Now let's make use of the File +iterator to see how to list all the nodes in the object tree:: + + >>> for node in h5file: + ... 
print(node) + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /detector (Group) 'Detector information' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + /detector/readout (Table(10,)) 'Readout example' + +We can use the :meth:`File.walk_groups` method of the File class to list only +the *groups* on tree:: + + >>> for group in h5file.walk_groups(): + ... print(group) + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /detector (Group) 'Detector information' + +Note that :meth:`File.walk_groups` actually returns an *iterator*, not a list +of objects. Using this iterator with the list_nodes() method is a powerful +combination. Let's see an example listing of all the arrays in the tree:: + + >>> for group in h5file.walk_groups("/"): + ... for array in h5file.list_nodes(group, classname='Array'): + ... print(array) + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + +:meth:`File.list_nodes` returns a list containing all the nodes hanging off a +specific Group. If the *classname* keyword is specified, the method will +filter out all instances which are not descendants of the class. We have +asked for only Array instances. There exist also an iterator counterpart +called :meth:`File.iter_nodes` that might be handy is some situations, like +for example when dealing with groups with a large number of nodes behind it. + +We can combine both calls by using the :meth:`File.walk_nodes` special method +of the File object. For example:: + + >>> for array in h5file.walk_nodes("/", "Array"): + ... print(array) + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + +This is a nice shortcut when working interactively. + +Finally, we will list all the Leaf, i.e. Table and Array instances (see +:ref:`LeafClassDescr` for detailed information on Leaf class), in the +/detector group. Note that only one instance of the Table class (i.e. +readout) will be selected in this group (as should be the case):: + + >>> for leaf in h5file.root.detector._f_walknodes('Leaf'): + ... print(leaf) + /detector/readout (Table(10,)) 'Readout example' + +We have used a call to the :meth:`Group._f_walknodes` method, using the +*natural naming* path specification. + +Of course you can do more sophisticated node selections using these powerful +methods. But first, let's take a look at some important PyTables object +instance variables. + + +Setting and getting user attributes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +PyTables provides an easy and concise way to complement the meaning of your +node objects on the tree by using the AttributeSet class (see +:ref:`AttributeSetClassDescr`). You can access this object through the +standard attribute attrs in Leaf nodes and _v_attrs in Group nodes. 
+ +For example, let's imagine that we want to save the date indicating when the +data in /detector/readout table has been acquired, as well as the temperature +during the gathering process:: + + >>> table = h5file.root.detector.readout + >>> table.attrs.gath_date = "Wed, 06/12/2003 18:33" + >>> table.attrs.temperature = 18.4 + >>> table.attrs.temp_scale = "Celsius" + +Now, let's set a somewhat more complex attribute in the /detector group:: + + >>> detector = h5file.root.detector + >>> detector._v_attrs.stuff = [5, (2.3, 4.5), "Integer and tuple"] + +Note how the AttributeSet instance is accessed with the _v_attrs attribute +because detector is a Group node. In general, you can save any standard +Python data structure as an attribute node. See :ref:`AttributeSetClassDescr` +for a more detailed explanation of how they are serialized for export to +disk. + +Retrieving the attributes is equally simple:: + + >>> table.attrs.gath_date + 'Wed, 06/12/2003 18:33' + >>> table.attrs.temperature + 18.399999999999999 + >>> table.attrs.temp_scale + 'Celsius' + >>> detector._v_attrs.stuff + [5, (2.2999999999999998, 4.5), 'Integer and tuple'] + +You can probably guess how to delete attributes:: + + >>> del table.attrs.gath_date + +If you want to examine the current user attribute set of /detector/table, you +can print its representation (try hitting the TAB key twice if you are on a +Unix Python console with the rlcompleter module active):: + + >>> table.attrs + /detector/readout._v_attrs (AttributeSet), 23 attributes: + [CLASS := 'TABLE', + FIELD_0_FILL := 0, + FIELD_0_NAME := 'ADCcount', + FIELD_1_FILL := 0, + FIELD_1_NAME := 'TDCcount', + FIELD_2_FILL := 0.0, + FIELD_2_NAME := 'energy', + FIELD_3_FILL := 0, + FIELD_3_NAME := 'grid_i', + FIELD_4_FILL := 0, + FIELD_4_NAME := 'grid_j', + FIELD_5_FILL := 0, + FIELD_5_NAME := 'idnumber', + FIELD_6_FILL := '', + FIELD_6_NAME := 'name', + FIELD_7_FILL := 0.0, + FIELD_7_NAME := 'pressure', + FLAVOR := 'numpy', + NROWS := 10, + TITLE := 'Readout example', + VERSION := '2.6', + temp_scale := 'Celsius', + temperature := 18.399999999999999] + +We've got all the attributes (including the *system* attributes). 
You can get +a list of *all* attributes or only the *user* or *system* attributes with the +_f_list() method:: + + >>> print(table.attrs._f_list("all")) + ['CLASS', 'FIELD_0_FILL', 'FIELD_0_NAME', 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', 'FIELD_3_FILL', 'FIELD_3_NAME', 'FIELD_4_FILL', + 'FIELD_4_NAME', 'FIELD_5_FILL', 'FIELD_5_NAME', 'FIELD_6_FILL', 'FIELD_6_NAME', + 'FIELD_7_FILL', 'FIELD_7_NAME', 'FLAVOR', 'NROWS', 'TITLE', 'VERSION', + 'temp_scale', 'temperature'] + >>> print(table.attrs._f_list("user")) + ['temp_scale', 'temperature'] + >>> print(table.attrs._f_list("sys")) + ['CLASS', 'FIELD_0_FILL', 'FIELD_0_NAME', 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', 'FIELD_3_FILL', 'FIELD_3_NAME', 'FIELD_4_FILL', + 'FIELD_4_NAME', 'FIELD_5_FILL', 'FIELD_5_NAME', 'FIELD_6_FILL', 'FIELD_6_NAME', + 'FIELD_7_FILL', 'FIELD_7_NAME', 'FLAVOR', 'NROWS', 'TITLE', 'VERSION'] + +You can also rename attributes:: + + >>> table.attrs._f_rename("temp_scale","tempScale") + >>> print(table.attrs._f_list()) + ['tempScale', 'temperature'] + +And, from PyTables 2.0 on, you are allowed also to set, delete or rename +system attributes:: + + >>> table.attrs._f_rename("VERSION", "version") + >>> table.attrs.VERSION + Traceback (most recent call last): + File "", line 1, in + File "tables/attributeset.py", line 222, in __getattr__ + (name, self._v__nodepath) + AttributeError: Attribute 'VERSION' does not exist in node: '/detector/readout' + >>> table.attrs.version + '2.6' + +*Caveat emptor:* you must be careful when modifying system attributes because +you may end fooling PyTables and ultimately getting unwanted behaviour. Use +this only if you know what are you doing. + +So, given the caveat above, we will proceed to restore the original name of +VERSION attribute:: + + >>> table.attrs._f_rename("version", "VERSION") + >>> table.attrs.VERSION + '2.6' + +Ok, that's better. If you would terminate your session now, you would be able +to use the h5ls command to read the /detector/readout attributes from the +file written to disk. + +.. code-block:: bash + + $ h5ls -vr tutorial1.h5/detector/readout + Opened "tutorial1.h5" with sec2 driver. 
+ /detector/readout Dataset {10/Inf} + Attribute: CLASS scalar + Type: 6-byte null-terminated ASCII string + Data: "TABLE" + Attribute: VERSION scalar + Type: 4-byte null-terminated ASCII string + Data: "2.6" + Attribute: TITLE scalar + Type: 16-byte null-terminated ASCII string + Data: "Readout example" + Attribute: NROWS scalar + Type: native long long + Data: 10 + Attribute: FIELD_0_NAME scalar + Type: 9-byte null-terminated ASCII string + Data: "ADCcount" + Attribute: FIELD_1_NAME scalar + Type: 9-byte null-terminated ASCII string + Data: "TDCcount" + Attribute: FIELD_2_NAME scalar + Type: 7-byte null-terminated ASCII string + Data: "energy" + Attribute: FIELD_3_NAME scalar + Type: 7-byte null-terminated ASCII string + Data: "grid_i" + Attribute: FIELD_4_NAME scalar + Type: 7-byte null-terminated ASCII string + Data: "grid_j" + Attribute: FIELD_5_NAME scalar + Type: 9-byte null-terminated ASCII string + Data: "idnumber" + Attribute: FIELD_6_NAME scalar + Type: 5-byte null-terminated ASCII string + Data: "name" + Attribute: FIELD_7_NAME scalar + Type: 9-byte null-terminated ASCII string + Data: "pressure" + Attribute: FLAVOR scalar + Type: 5-byte null-terminated ASCII string + Data: "numpy" + Attribute: tempScale scalar + Type: 7-byte null-terminated ASCII string + Data: "Celsius" + Attribute: temperature scalar + Type: native double + Data: 18.4 + Location: 0:1:0:1952 + Links: 1 + Modified: 2006-12-11 10:35:13 CET + Chunks: {85} 3995 bytes + Storage: 470 logical bytes, 3995 allocated bytes, 11.76% utilization + Type: struct { + "ADCcount" +0 native unsigned short + "TDCcount" +2 native unsigned char + "energy" +3 native double + "grid_i" +11 native int + "grid_j" +15 native int + "idnumber" +19 native long long + "name" +27 16-byte null-terminated ASCII string + "pressure" +43 native float + } 47 bytes + +Attributes are a useful mechanism to add persistent (meta) information to +your data. + + +Getting object metadata +~~~~~~~~~~~~~~~~~~~~~~~ +Each object in PyTables has *metadata* information about the data in the +file. Normally this *meta-information* is accessible through the node +instance variables. Let's take a look at some examples:: + + >>> print("Object:", table) + Object: /detector/readout (Table(10,)) 'Readout example' + >>> print("Table name:", table.name) + Table name: readout + >>> print("Table title:", table.title) + Table title: Readout example + >>> print("Number of rows in table:", table.nrows) + Number of rows in table: 10 + >>> print("Table variable names with their type and shape:") + Table variable names with their type and shape: + >>> for name in table.colnames: + ... print(name, ':= %s, %s' % (table.coldtypes[name], table.coldtypes[name].shape)) + ADCcount := uint16, () + TDCcount := uint8, () + energy := float64, () + grid_i := int32, () + grid_j := int32, () + idnumber := int64, () + name := |S16, () + pressure := float32, () + +Here, the name, title, nrows, colnames and coldtypes attributes (see +:class:`Table` for a complete attribute list) of the Table object gives us +quite a bit of information about the table data. + +You can interactively retrieve general information about the public objects +in PyTables by asking for help:: + + >>> help(table) + Help on Table in module tables.table: + class Table(tableextension.Table, tables.leaf.Leaf) + | This class represents heterogeneous datasets in an HDF5 file. 
+ | + | Tables are leaves (see the `Leaf` class) whose data consists of a + | unidimensional sequence of *rows*, where each row contains one or + | more *fields*. Fields have an associated unique *name* and + | *position*, with the first field having position 0. All rows have + | the same fields, which are arranged in *columns*. + [snip] + | + | Instance variables + | ------------------ + | + | The following instance variables are provided in addition to those + | in `Leaf`. Please note that there are several `col` dictionaries + | to ease retrieving information about a column directly by its path + | name, avoiding the need to walk through `Table.description` or + | `Table.cols`. + | + | autoindex + | Automatically keep column indexes up to date? + | + | Setting this value states whether existing indexes should be + | automatically updated after an append operation or recomputed + | after an index-invalidating operation (i.e. removal and + | modification of rows). The default is true. + [snip] + | rowsize + | The size in bytes of each row in the table. + | + | Public methods -- reading + | ------------------------- + | + | * col(name) + | * iterrows([start][, stop][, step]) + | * itersequence(sequence) + * itersorted(sortby[, checkCSI][, start][, stop][, step]) + | * read([start][, stop][, step][, field][, coords]) + | * read_coordinates(coords[, field]) + * read_sorted(sortby[, checkCSI][, field,][, start][, stop][, step]) + | * __getitem__(key) + | * __iter__() + | + | Public methods -- writing + | ------------------------- + | + | * append(rows) + | * modify_column([start][, stop][, step][, column][, colname]) + [snip] + +Try getting help with other object docs by yourself:: + + >>> help(h5file) + >>> help(table.remove_rows) + +To examine metadata in the */columns/pressure* Array object:: + + >>> pressureObject = h5file.get_node("/columns", "pressure") + >>> print("Info on the object:", repr(pressureObject)) + Info on the object: /columns/pressure (Array(3,)) 'Pressure column selection' + atom := Float64Atom(shape=(), dflt=0.0) + maindim := 0 + flavor := 'numpy' + byteorder := 'little' + chunkshape := None + >>> print(" shape: ==>", pressureObject.shape) + shape: ==> (3,) + >>> print(" title: ==>", pressureObject.title) + title: ==> Pressure column selection + >>> print(" atom: ==>", pressureObject.atom) + atom: ==> Float64Atom(shape=(), dflt=0.0) + +Observe that we have used the :meth:`File.get_node` method of the File class +to access a node in the tree, instead of the natural naming method. Both are +useful, and depending on the context you will prefer one or the other. +:meth:`File.get_node` has the advantage that it can get a node from the +pathname string (as in this example) and can also act as a filter to show +only nodes in a particular location that are instances of class *classname*. +In general, however, I consider natural naming to be more elegant and easier +to use, especially if you are using the name completion capability present in +interactive console. Try this powerful combination of natural naming and +completion capabilities present in most Python consoles, and see how pleasant +it is to browse the object tree (well, as pleasant as such an activity can +be). + +If you look at the type attribute of the pressureObject object, you can +verify that it is a "*float64*" array. By looking at its shape attribute, you +can deduce that the array on disk is unidimensional and has 3 elements. 
+See :class:`Array` or the internal doc strings for the complete Array
+attribute list.
+
+
+Reading data from Array objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Once you have found the desired Array, use the read() method of the Array
+object to retrieve its data::
+
+    >>> pressureArray = pressureObject.read()
+    >>> pressureArray
+    array([ 25.,  36.,  49.])
+    >>> print("pressureArray is an object of type:", type(pressureArray))
+    pressureArray is an object of type: <class 'numpy.ndarray'>
+    >>> nameArray = h5file.root.columns.name.read()
+    >>> print("nameArray is an object of type:", type(nameArray))
+    nameArray is an object of type: <class 'list'>
+    >>>
+    >>> print("Data on arrays nameArray and pressureArray:")
+    Data on arrays nameArray and pressureArray:
+    >>> for i in range(pressureObject.shape[0]):
+    ...     print(nameArray[i], "-->", pressureArray[i])
+    Particle:      5 --> 25.0
+    Particle:      6 --> 36.0
+    Particle:      7 --> 49.0
+
+You can see that the :meth:`Array.read` method returns an authentic NumPy
+object for the pressureObject instance by looking at the output of the type()
+call. A read() of the nameArray object instance returns a native Python list
+(of strings). The type of the object saved is stored as an HDF5 attribute
+(named FLAVOR) for objects on disk. This attribute is then read as Array
+meta-information (accessible through the Array.attrs.FLAVOR variable),
+enabling the read array to be converted into the original object. This
+provides a means to save a large variety of objects as arrays with the
+guarantee that you will be able to later recover them in their original form.
+See :meth:`File.create_array` for a complete list of supported objects for the
+Array object class.
+
+
+Committing data to tables and arrays
+-------------------------------------
+We have seen how to create tables and arrays and how to browse both data and
+metadata in the object tree. Let's now examine more closely one of the most
+powerful capabilities of PyTables, namely, how to modify already created
+tables and arrays [1]_.
+
+
+Appending data to an existing table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Now, let's have a look at how we can add records to an existing table on
+disk. Let's use our well-known *readout* Table object and append some new
+values to it::
+
+    >>> table = h5file.root.detector.readout
+    >>> particle = table.row
+    >>> for i in range(10, 15):
+    ...     particle['name'] = f'Particle: {i:6d}'
+    ...     particle['TDCcount'] = i % 256
+    ...     particle['ADCcount'] = (i * 256) % (1 << 16)
+    ...     particle['grid_i'] = i
+    ...     particle['grid_j'] = 10 - i
+    ...     particle['pressure'] = float(i*i)
+    ...     particle['energy'] = float(particle['pressure'] ** 4)
+    ...     particle['idnumber'] = i * (2 ** 34)
+    ...     particle.append()
+    >>> table.flush()
+
+It's the same method we used to fill a new table. PyTables knows that this
+table is on disk, and when you add new records, they are appended to the end
+of the table [2]_.
+
+If you look carefully at the code you will see that we have used the
+table.row attribute to create a table row and fill it with the new values.
+Each time its append() method is called, the current row is committed to
+the output buffer and the row pointer is incremented to point to the next
+table record. When the buffer is full, the data is saved on disk, and the
+buffer is reused again for the next cycle.
+
+*Caveat emptor*: Do not forget to always call the flush() method after a
+write operation, or else your tables will not be updated!
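+
+.. note::
+
+    If explicit flush() calls are easy to forget, note that the File object
+    returned by tables.open_file() can also be used as a context manager:
+    leaving the ``with`` block closes the file, and closing the file flushes
+    all remaining buffers. The following sketch is not part of this
+    tutorial's flow; the file name and the appended value are just
+    placeholders::
+
+        import tables as tb
+
+        with tb.open_file("myfile.h5", mode="a") as h5file:
+            table = h5file.root.detector.readout
+            row = table.row
+            row['TDCcount'] = 42   # remaining fields keep their defaults
+            row.append()
+        # No explicit flush() is strictly needed here: closing the file
+        # (done automatically at the end of the "with" block) flushes the
+        # remaining buffers.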
+ +Let's have a look at some rows in the modified table and verify that our new +data has been appended:: + + >>> for r in table.iterrows(): + ... print("%-16s | %11.1f | %11.4g | %6d | %6d | %8d \|" % \\ + ... (r['name'], r['pressure'], r['energy'], r['grid_i'], r['grid_j'], + ... r['TDCcount'])) + Particle: 0 | 0.0 | 0 | 0 | 10 | 0 | + Particle: 1 | 1.0 | 1 | 1 | 9 | 1 | + Particle: 2 | 4.0 | 256 | 2 | 8 | 2 | + Particle: 3 | 9.0 | 6561 | 3 | 7 | 3 | + Particle: 4 | 16.0 | 6.554e+04 | 4 | 6 | 4 | + Particle: 5 | 25.0 | 3.906e+05 | 5 | 5 | 5 | + Particle: 6 | 36.0 | 1.68e+06 | 6 | 4 | 6 | + Particle: 7 | 49.0 | 5.765e+06 | 7 | 3 | 7 | + Particle: 8 | 64.0 | 1.678e+07 | 8 | 2 | 8 | + Particle: 9 | 81.0 | 4.305e+07 | 9 | 1 | 9 | + Particle: 10 | 100.0 | 1e+08 | 10 | 0 | 10 | + Particle: 11 | 121.0 | 2.144e+08 | 11 | -1 | 11 | + Particle: 12 | 144.0 | 4.3e+08 | 12 | -2 | 12 | + Particle: 13 | 169.0 | 8.157e+08 | 13 | -3 | 13 | + Particle: 14 | 196.0 | 1.476e+09 | 14 | -4 | 14 | + + +Modifying data in tables +~~~~~~~~~~~~~~~~~~~~~~~~ +Ok, until now, we've been only reading and writing (appending) values to our +tables. But there are times that you need to modify your data once you have +saved it on disk (this is specially true when you need to modify the real +world data to adapt your goals ;). +Let's see how we can modify the values that were saved in our existing tables. +We will start modifying single cells in the first row of the Particle table:: + + >>> print("Before modif-->", table[0]) + Before modif--> (0, 0, 0.0, 0, 10, 0L, 'Particle: 0', 0.0) + >>> table.cols.TDCcount[0] = 1 + >>> print("After modifying first row of ADCcount-->", table[0]) + After modifying first row of ADCcount--> (0, 1, 0.0, 0, 10, 0L, 'Particle: 0', 0.0) + >>> table.cols.energy[0] = 2 + >>> print("After modifying first row of energy-->", table[0]) + After modifying first row of energy--> (0, 1, 2.0, 0, 10, 0L, 'Particle: 0', 0.0) + +We can modify complete ranges of columns as well:: + + >>> table.cols.TDCcount[2:5] = [2,3,4] + >>> print("After modifying slice [2:5] of TDCcount-->", table[0:5]) + After modifying slice [2:5] of TDCcount--> + [(0, 1, 2.0, 0, 10, 0L, 'Particle: 0', 0.0) + (256, 1, 1.0, 1, 9, 17179869184L, 'Particle: 1', 1.0) + (512, 2, 256.0, 2, 8, 34359738368L, 'Particle: 2', 4.0) + (768, 3, 6561.0, 3, 7, 51539607552L, 'Particle: 3', 9.0) + (1024, 4, 65536.0, 4, 6, 68719476736L, 'Particle: 4', 16.0)] + >>> table.cols.energy[1:9:3] = [2,3,4] + >>> print("After modifying slice [1:9:3] of energy-->", table[0:9]) + After modifying slice [1:9:3] of energy--> + [(0, 1, 2.0, 0, 10, 0L, 'Particle: 0', 0.0) + (256, 1, 2.0, 1, 9, 17179869184L, 'Particle: 1', 1.0) + (512, 2, 256.0, 2, 8, 34359738368L, 'Particle: 2', 4.0) + (768, 3, 6561.0, 3, 7, 51539607552L, 'Particle: 3', 9.0) + (1024, 4, 3.0, 4, 6, 68719476736L, 'Particle: 4', 16.0) + (1280, 5, 390625.0, 5, 5, 85899345920L, 'Particle: 5', 25.0) + (1536, 6, 1679616.0, 6, 4, 103079215104L, 'Particle: 6', 36.0) + (1792, 7, 4.0, 7, 3, 120259084288L, 'Particle: 7', 49.0) + (2048, 8, 16777216.0, 8, 2, 137438953472L, 'Particle: 8', 64.0)] + +Check that the values have been correctly modified! + +.. hint:: + + remember that column TDCcount is the second one, and that energy is the + third. Look for more info on modifying columns in + :meth:`Column.__setitem__`. + +PyTables also lets you modify complete sets of rows at the same time. As a +demonstration of these capability, see the next example:: + + >>> table.modify_rows(start=1, step=3, + ... 
rows=[(1, 2, 3.0, 4, 5, 6, 'Particle: None', 8.0),
+    ...                         (2, 4, 6.0, 8, 10, 12, 'Particle: None*2', 16.0)])
+    2
+    >>> print("After modifying the second and fifth rows-->", table[0:5])
+    After modifying the second and fifth rows-->
+    [(0, 1, 2.0, 0, 10, 0L, 'Particle: 0', 0.0)
+     (1, 2, 3.0, 4, 5, 6L, 'Particle: None', 8.0)
+     (512, 2, 256.0, 2, 8, 34359738368L, 'Particle: 2', 4.0)
+     (768, 3, 6561.0, 3, 7, 51539607552L, 'Particle: 3', 9.0)
+     (2, 4, 6.0, 8, 10, 12L, 'Particle: None*2', 16.0)]
+
+As you can see, the modify_rows() call has modified the second and fifth
+rows, and it returned the number of modified rows.
+
+Apart from :meth:`Table.modify_rows`, there is another method, called
+:meth:`Table.modify_column`, to modify specific columns as well.
+
+Finally, there is another way of modifying tables that is generally more
+handy than the one described above. This new way uses the :meth:`Row.update`
+method of the Row instance that is attached to every table, so it
+is meant to be used inside table iterators. Take a look at the following
+example::
+
+    >>> for row in table.where('TDCcount <= 2'):
+    ...     row['energy'] = row['TDCcount']*2
+    ...     row.update()
+    >>> print("After modifying energy column (where TDCcount <=2)-->", table[0:4])
+    After modifying energy column (where TDCcount <=2)-->
+    [(0, 1, 2.0, 0, 10, 0L, 'Particle: 0', 0.0)
+     (1, 2, 4.0, 4, 5, 6L, 'Particle: None', 8.0)
+     (512, 2, 4.0, 2, 8, 34359738368L, 'Particle: 2', 4.0)
+     (768, 3, 6561.0, 3, 7, 51539607552L, 'Particle: 3', 9.0)]
+
+.. note::
+
+    The authors find this way of updating tables (i.e. using Row.update())
+    to be both convenient and efficient. Please make sure to use it
+    extensively.
+
+*Caveat emptor*: Currently, :meth:`Row.update` will not work (the table will
+not be updated) if the loop is exited with a ``break`` statement. A possible
+workaround is to manually flush the row's internal buffer by calling
+``row._flushModRows()`` just before the ``break`` statement.
+
+Modifying data in arrays
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We are now going to see how to modify data in array objects.
+The basic way to do this is by using the :meth:`Array.__setitem__`
+special method. Let's see how to modify data in the pressureObject array::
+
+    >>> pressureObject = h5file.root.columns.pressure
+    >>> print("Before modif-->", pressureObject[:])
+    Before modif--> [ 25. 36. 49.]
+    >>> pressureObject[0] = 2
+    >>> print("First modif-->", pressureObject[:])
+    First modif--> [ 2. 36. 49.]
+    >>> pressureObject[1:3] = [2.1, 3.5]
+    >>> print("Second modif-->", pressureObject[:])
+    Second modif--> [ 2. 2.1 3.5]
+    >>> pressureObject[::2] = [1,2]
+    >>> print("Third modif-->", pressureObject[:])
+    Third modif--> [ 1. 2.1 2. ]
+
+So, in general, you can use any combination of (multidimensional) extended
+slicing, with the sole exception that you cannot use negative values for
+step to refer to the indexes that you want to modify. See
+:meth:`Array.__getitem__` for more examples on how to use extended slicing
+in PyTables objects.
+
+Similarly, with an array of strings::
+
+    >>> nameObject = h5file.root.columns.name
+    >>> print("Before modif-->", nameObject[:])
+    Before modif--> ['Particle: 5', 'Particle: 6', 'Particle: 7']
+    >>> nameObject[0] = 'Particle: None'
+    >>> print("First modif-->", nameObject[:])
+    First modif--> ['Particle: None', 'Particle: 6', 'Particle: 7']
+    >>> nameObject[1:3] = ['Particle: 0', 'Particle: 1']
+    >>> print("Second modif-->", nameObject[:])
+    Second modif--> ['Particle: None', 'Particle: 0', 'Particle: 1']
+    >>> nameObject[::2] = ['Particle: -3', 'Particle: -5']
+    >>> print("Third modif-->", nameObject[:])
+    Third modif--> ['Particle: -3', 'Particle: 0', 'Particle: -5']
+
+
+And finally... how to delete rows from a table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+We'll finish this tutorial by deleting some rows from the table we have.
+Suppose that we want to delete the 5th to 9th rows (inclusive)::
+
+    >>> table.remove_rows(5,10)
+    5
+
+:meth:`Table.remove_rows` deletes the rows in the range (start, stop). It
+returns the number of rows effectively removed.
+
+We have reached the end of this first tutorial. Don't forget to close the
+file when you finish::
+
+    >>> h5file.close()
+    >>> ^D
+    $
+
+In :ref:`Figure 2 <tutorial1-2-tableview>` you can see a graphical view of
+the PyTables file with the datasets we have just created. The general
+properties of the table /detector/readout are displayed in
+:ref:`tutorial1-general`.
+
+.. _tutorial1-2-tableview:
+
+.. figure:: images/tutorial1-2-tableview.png
+    :align: center
+
+    **Figure 2. The final version of the data file for tutorial 1.**
+
+
+.. _tutorial1-general:
+
+.. figure:: images/tutorial1-general.png
+    :align: center
+
+    **Figure 3. General properties of the /detector/readout table.**
+
+
+.. _secondExample:
+
+Multidimensional table cells and automatic sanity checks
+----------------------------------------------------------
+Now it's time for a more real-life example (i.e. with errors in the code). We
+will create two groups that branch directly from the root node, Particles and
+Events. Then, we will put three tables in each group. In Particles we will
+put tables based on the Particle descriptor and in Events, the tables based
+on the Event descriptor.
+
+Afterwards, we will fill the tables with a number of records. Finally,
+we will read the newly-created table /Events/TEvent3 and select some values
+from it, using a list comprehension.
+
+Look at the next script (you can find it in :file:`examples/tutorial2.py`).
+It appears to do all of the above, but it contains some small bugs. Note that
+this Particle class is not directly related to the one defined in the
+last tutorial; this class is simpler (note, however, the *multidimensional*
+columns called pressure and temperature).
+
+We also introduce a new way to describe a Table: as a structured NumPy
+dtype (or even as a dictionary), as you can see in the Event description. 
See
+:meth:`File.create_table` for different kinds of descriptor objects that
+can be passed to this method::
+
+    import tables as tb
+    import numpy as np
+
+    # Describe a particle record
+    class Particle(tb.IsDescription):
+        name = tb.StringCol(itemsize=16)  # 16-character string
+        lati = tb.Int32Col()              # integer
+        longi = tb.Int32Col()             # integer
+        pressure = tb.Float32Col(shape=(2, 3))     # array of floats (single-precision)
+        temperature = tb.Float64Col(shape=(2, 3))  # array of doubles (double-precision)
+
+    # Native NumPy dtype instances are also accepted
+    Event = np.dtype([
+        ("name", "S16"),
+        ("TDCcount", np.uint8),
+        ("ADCcount", np.uint16),
+        ("xcoord", np.float32),
+        ("ycoord", np.float32)
+    ])
+
+    # And dictionaries too (this defines the same structure as above)
+    # Event = {
+    #     "name": tb.StringCol(itemsize=16),
+    #     "TDCcount": tb.UInt8Col(),
+    #     "ADCcount": tb.UInt16Col(),
+    #     "xcoord": tb.Float32Col(),
+    #     "ycoord": tb.Float32Col(),
+    # }
+
+    # Open a file in "w"rite mode
+    fileh = tb.open_file("tutorial2.h5", mode="w")
+
+    # Get the HDF5 root group
+    root = fileh.root
+
+    # Create the groups:
+    for groupname in ("Particles", "Events"):
+        group = fileh.create_group(root, groupname)
+
+    # Now, create and fill the tables in Particles group
+    gparticles = root.Particles
+
+    # Create 3 new tables
+    for tablename in ("TParticle1", "TParticle2", "TParticle3"):
+        # Create a table
+        table = fileh.create_table("/Particles", tablename, Particle, "Particles: "+tablename)
+
+        # Get the record object associated with the table:
+        particle = table.row
+
+        # Fill the table with 257 particles
+        for i in range(257):
+            # First, assign the values to the Particle record
+            particle['name'] = f'Particle: {i:6d}'
+            particle['lati'] = i
+            particle['longi'] = 10 - i
+
+            ########### Detectable errors start here. Play with them!
+            particle['pressure'] = i * np.arange(2 * 4).reshape(2, 4)  # Incorrect
+            #particle['pressure'] = i * np.arange(2 * 3).reshape(2, 3)  # Correct
+            ########### End of errors
+
+            particle['temperature'] = i ** 2    # Broadcasting
+
+            # This injects the Record values
+            particle.append()
+
+        # Flush the table buffers
+        table.flush()
+
+    # Now, go for Events:
+    for tablename in ("TEvent1", "TEvent2", "TEvent3"):
+        # Create a table in Events group
+        table = fileh.create_table(root.Events, tablename, Event, "Events: "+tablename)
+
+        # Get the record object associated with the table:
+        event = table.row
+
+        # Fill the table with 257 events
+        for i in range(257):
+            # First, assign the values to the Event record
+            event['name'] = f'Event: {i:6d}'
+            event['TDCcount'] = i % (1 << 8)   # Correct range
+
+            ########### Detectable errors start here. Play with them!
+            event['xcoor'] = float(i ** 2)     # Wrong spelling
+            #event['xcoord'] = float(i ** 2)   # Correct spelling
+            event['ADCcount'] = "sss"          # Wrong type
+            #event['ADCcount'] = i * 2         # Correct type
+            ########### End of errors
+
+            event['ycoord'] = float(i) ** 4
+
+            # This injects the Record values
+            event.append()
+
+        # Flush the buffers
+        table.flush()
+
+    # Read the records from table "/Events/TEvent3" and select some
+    table = root.Events.TEvent3
+    e = [p['TDCcount'] for p in table if p['ADCcount'] < 20 and 4 <= p['TDCcount'] < 15]
+    print(f"Last record ==> {table[-1]}")
+    print(f"Selected values ==> {e}")
+    print(f"Total selected records ==> {len(e)}")
+
+    # Finally, close the file (this also will flush all the remaining buffers!)
+    fileh.close()
+
+
+Shape checking
+~~~~~~~~~~~~~~~
+If you look at the code carefully, you'll see that it won't work. 
You will
+get the following error.
+
+.. code-block:: bash
+
+    $ python3 tutorial2.py
+    Traceback (most recent call last):
+      File "tutorial2.py", line 60, in <module>
+        particle['pressure'] = i * np.arange(2 * 4).reshape(2, 4)  # Incorrect
+    ValueError: could not broadcast input array from shape (2,4) into shape (2,3)
+    Closing remaining open files: tutorial2.h5... done
+
+This error indicates that you are trying to assign an array with an
+incompatible shape to a table cell. Looking at the source, we see that we
+were trying to assign an array of shape (2,4) to a pressure element, which
+was defined with the shape (2,3).
+
+In general, these kinds of operations are forbidden, with one valid
+exception: when you assign a *scalar* value to a multidimensional column
+cell, all the cell elements are populated with the value of the scalar.
+For example::
+
+    particle['temperature'] = i ** 2    # Broadcasting
+
+The value i**2 is assigned to all the elements of the temperature table cell.
+This capability is provided by the NumPy package and is known as
+*broadcasting*.
+
+
+Field name checking
+~~~~~~~~~~~~~~~~~~~~
+After fixing the previous error and rerunning the program, we encounter
+another error.
+
+.. code-block:: bash
+
+    $ python3 tutorial2.py
+    Traceback (most recent call last):
+      File "tutorial2.py", line 73, in <module>
+        event['xcoor'] = float(i ** 2)     # Wrong spelling
+      File "tableextension.pyx", line 1094, in tableextension.Row.__setitem__
+      File "tableextension.pyx", line 127, in tableextension.get_nested_field_cache
+      File "utilsextension.pyx", line 331, in utilsextension.get_nested_field
+    KeyError: 'no such column: xcoor'
+
+This error indicates that we are attempting to assign a value to a
+non-existent field in the *event* table object. By looking carefully at the
+Event class attributes, we see that we misspelled the xcoord field (we wrote
+xcoor instead). This is unusual behavior for Python, as normally when you
+assign a value to a non-existent instance variable, Python creates a new
+variable with that name. Such a feature can be dangerous when dealing with an
+object that contains a fixed list of field names. PyTables checks that the
+field exists and raises a KeyError if the check fails.
+
+
+Data type checking
+~~~~~~~~~~~~~~~~~~~
+Finally, the last issue which we will find here is a TypeError exception.
+
+.. code-block:: bash
+
+    $ python3 tutorial2.py
+    Traceback (most recent call last):
+      File "tutorial2.py", line 75, in <module>
+        event['ADCcount'] = "sss"          # Wrong type
+      File "tableextension.pyx", line 1111, in tableextension.Row.__setitem__
+    TypeError: invalid type (<class 'str'>) for column ``ADCcount``
+
+And, if we change the affected line to read::
+
+    event['ADCcount'] = i * 2         # Correct type
+
+we will see that the script ends well.
+
+You can see the structure created with this (corrected) script in
+:ref:`Figure 4 <tutorial2-tableview>`.
+In particular, note the multidimensional column cells in table
+/Particles/TParticle2.
+
+.. _tutorial2-tableview:
+
+.. figure:: images/tutorial2-tableview.png
+    :align: center
+
+    **Figure 4. Table hierarchy for tutorial 2.**
+
+
+.. _LinksTutorial:
+
+Using links for more convenient access to nodes
+-------------------------------------------------
+Links are special nodes that can be used to create additional paths to your
+existing nodes. PyTables supports three kinds of links: hard links, soft
+links (aka symbolic links) and external links.
+ +Hard links let the user create additional paths to access another node in the +same file, and once created, they are indistinguishable from the referred +node object, except that they have different paths in the object tree. For +example, if the referred node is, say, a Table object, then the new hard link +will become a Table object itself. From this point on, you will be able to +access the same Table object from two different paths: the original one and +the new hard link path. If you delete one path to the table, you will be +able to reach it via the other path. + +Soft links are similar to hard links, but they keep their own personality. +When you create a soft link to another node, you will get a new SoftLink +object that *refers* to that node. However, in order to access the referred +node, you need to *dereference* it. + +Finally, external links are like soft links, with the difference that these +are meant to point to nodes in *external* files instead of nodes in the same +file. They are represented by the ExternalLink class and, like soft links, +you need to dereference them in order to get access to the pointed node. + + +Interactive example +~~~~~~~~~~~~~~~~~~~ +Now we are going to learn how to deal with links. You can find the code used +in this section in :file:`examples/links.py`. + +First, let's create a file with some group structure:: + + >>> import tables as tb + >>> f1 = tb.open_file('links1.h5', 'w') + >>> g1 = f1.create_group('/', 'g1') + >>> g2 = f1.create_group(g1, 'g2') + +Now, we will put some datasets on the /g1 and /g1/g2 groups:: + + >>> a1 = f1.create_carray(g1, 'a1', tb.Int64Atom(), shape=(10000,)) + >>> t1 = f1.create_table(g2, 't1', {'f1': tb.IntCol(), 'f2': tb.FloatCol()}) + +We can start the party now. We are going to create a new group, say /gl, +where we will put our links and will start creating one hard link too:: + + >>> gl = f1.create_group('/', 'gl') + >>> ht = f1.create_hard_link(gl, 'ht', '/g1/g2/t1') # ht points to t1 + >>> print(f"``{ht}`` is a hard link to: ``{t1}``") + ``/gl/ht (Table(0,)) `` is a hard link to: ``/g1/g2/t1 (Table(0,)) `` + +You can see how we've created a hard link in /gl/ht which is pointing to the +existing table in /g1/g2/t1. Have look at how the hard link is represented; +it looks like a table, and actually, it is a *real* table. We have two +different paths to access that table, the original /g1/g2/t1 and the new one +/gl/ht. If we remove the original path we still can reach the table by using +the new path:: + + >>> t1.remove() + >>> print(f"table continues to be accessible in: ``{f1.get_node('/gl/ht')}``") + table continues to be accessible in: ``/gl/ht (Table(0,)) `` + +So far so good. Now, let's create a couple of soft links:: + + >>> la1 = f1.create_soft_link(gl, 'la1', '/g1/a1') # la1 points to a1 + >>> print(f"``{la1}`` is a soft link to: ``{la1.target}``") + ``/gl/la1 (SoftLink) -> /g1/a1`` is a soft link to: ``/g1/a1`` + >>> lt = f1.create_soft_link(gl, 'lt', '/g1/g2/t1') # lt points to t1 + >>> print(f"``{lt}`` is a soft link to: ``{lt.target}``") + ``/gl/lt (SoftLink) -> /g1/g2/t1 (dangling)`` is a soft link to: ``/g1/g2/t1`` + +Okay, we see how the first link /gl/la1 points to the array /g1/a1. Notice +how the link prints as a SoftLink, and how the referred node is stored in the +target instance attribute. The second link (/gt/lt) pointing to /g1/g2/t1 +also has been created successfully, but by better inspecting the string +representation of it, we see that is labeled as '(dangling)'. Why is this? 
+
+Well, you should remember that we recently removed the /g1/g2/t1 path to
+access table t1. When printing it, the object knows that it points to
+*nowhere* and reports this.
+This is a nice way to quickly know whether a soft link points to an existing
+node or not.
+
+So, let's re-create the removed path to the t1 table::
+
+    >>> t1 = f1.create_hard_link('/g1/g2', 't1', '/gl/ht')
+    >>> print(f"``{lt}`` is not dangling anymore")
+    ``/gl/lt (SoftLink) -> /g1/g2/t1`` is not dangling anymore
+
+and the soft link is pointing to an existing node now.
+
+Of course, for soft links to serve any actual purpose we need a way to get
+the pointed node. It happens that soft links are callable, and that's the
+way to get the referred nodes back::
+
+    >>> plt = lt()
+    >>> print(f"dereferenced lt node: ``{plt}``")
+    dereferenced lt node: ``/g1/g2/t1 (Table(0,)) ``
+    >>> pla1 = la1()
+    >>> print(f"dereferenced la1 node: ``{pla1}``")
+    dereferenced la1 node: ``/g1/a1 (CArray(10000,)) ``
+
+Now, plt is a Python reference to the t1 table while pla1 refers to the a1
+array. Easy, huh?
+
+Let's suppose now that a1 is an array whose access speed is critical for our
+application. One possible solution is to move the entire file onto a faster
+disk, say, a solid-state disk, so that access latencies can be reduced quite
+a lot. However, it happens that our file is too big to fit into our shiny new
+(although small in capacity) SSD disk. A solution is to copy just the a1
+array into a separate file that would fit into our SSD disk. However, our
+application would then have to handle two files instead of only one, adding
+significantly more complexity, which is not a good thing.
+
+External links to the rescue! As we've already said, external links are like
+soft links, but they are designed to link objects in external files. Back to
+our problem, let's copy the a1 array into a different file::
+
+    >>> f2 = tb.open_file('links2.h5', 'w')
+    >>> new_a1 = a1.copy(f2.root, 'a1')
+    >>> f2.close()  # close the other file
+
+And now, we can remove the existing soft link and create the external link in
+its place::
+
+    >>> la1.remove()
+    >>> la1 = f1.create_external_link(gl, 'la1', 'links2.h5:/a1')
+    >>> print(f"``{la1}`` is an external link to: ``{la1.target}``")
+    ``/gl/la1 (ExternalLink) -> links2.h5:/a1`` is an external link to: ``links2.h5:/a1``
+
+Let's try dereferencing it::
+
+    >>> new_a1 = la1()  # dereferencing la1 returns a1 in links2.h5
+    >>> print(f"dereferenced la1 node: ``{new_a1}``")
+    dereferenced la1 node: ``/a1 (CArray(10000,)) ``
+
+Well, it seems like we can access the external node. But just to make sure
+that the node is in the other file::
+
+    >>> print("new_a1 file:", new_a1._v_file.filename)
+    new_a1 file: links2.h5
+
+Okay, the node is definitely in the external file. So, you won't have to
+worry about your application: it will work exactly the same no matter whether
+the link is internal (soft) or external.
+
+Finally, here is a dump of the objects in the final file, just to get a
+better idea of what we ended up with::
+
+    >>> f1.close()
+    >>> exit()
+    $ ptdump links1.h5
+    / (RootGroup) ''
+    /g1 (Group) ''
+    /g1/a1 (CArray(10000,)) ''
+    /gl (Group) ''
+    /gl/ht (Table(0,)) ''
+    /gl/la1 (ExternalLink) -> links2.h5:/a1
+    /gl/lt (SoftLink) -> /g1/g2/t1
+    /g1/g2 (Group) ''
+    /g1/g2/t1 (Table(0,)) ''
+
+This ends this tutorial. I hope it helped you to appreciate how useful links
+can be. I'm sure you will find other ways in which you can use links that
+better fit your own needs.
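+
+As a quick recap, here is a small sketch (not part of :file:`examples/links.py`;
+it assumes the links1.h5 and links2.h5 files created above) showing how the
+three kinds of links behave when the file is reopened later::
+
+    import tables as tb
+
+    with tb.open_file('links1.h5', 'r') as f1:
+        gl = f1.root.gl
+        ht = gl.ht       # hard link: this is already a real Table
+        t1 = gl.lt()     # soft link: call it to get the referred Table
+        a1 = gl.la1()    # external link: this opens links2.h5 behind the scenes
+        print(ht.nrows, t1.nrows, a1.shape)
+        a1._v_file.close()   # close the externally opened links2.h5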
+ + +Exercising the Undo/Redo feature +-------------------------------- +PyTables has integrated support for undoing and/or redoing actions. This +functionality lets you put marks in specific places of your hierarchy +manipulation operations, so that you can make your HDF5 file pop back +(*undo*) to a specific mark (for example for inspecting how your hierarchy +looked at that point). You can also go forward to a more recent marker +(*redo*). You can even do jumps to the marker you want using just one +instruction as we will see shortly. + +You can undo/redo all the operations that are related to object tree +management, like creating, deleting, moving or renaming nodes (or complete +sub-hierarchies) inside a given object tree. You can also undo/redo +operations (i.e. creation, deletion or modification) of persistent node +attributes. However, actions which include *internal* modifications of datasets +(that includes Table.append, Table.modify_rows or Table.remove_rows, among others) +cannot be currently undone/redone. + +This capability can be useful in many situations, for example when doing +simulations with multiple branches. When you have to choose a path to follow +in such a situation, you can put a mark there and, if the simulation is not +going well, you can go back to that mark and start another path. Another +possible application is defining coarse-grained operations which operate in a +transactional-like way, i.e. which return the database to its previous state +if the operation finds some kind of problem while running. You can probably +devise many other scenarios where the Undo/Redo feature can be useful to you +[3]_. + + +A basic example +~~~~~~~~~~~~~~~ +In this section, we are going to show the basic behavior of the Undo/Redo +feature. You can find the code used in this example in +:file:`examples/tutorial3-1.py`. A somewhat more complex example will be +explained in the next section. + +First, let's create a file:: + + >>> import tables + >>> fileh = tables.open_file("tutorial3-1.h5", "w", title="Undo/Redo demo 1") + +And now, activate the Undo/Redo feature with the method +:meth:`File.enable_undo` of File:: + + >>> fileh.enable_undo() + +From now on, all our actions will be logged internally by PyTables. Now, we +are going to create a node (in this case an Array object):: + + >>> one = fileh.create_array('/', 'anarray', [3,4], "An array") + +Now, mark this point:: + + >>> fileh.mark() + 1 + +We have marked the current point in the sequence of actions. +In addition, the mark() method has returned the identifier assigned to this +new mark, that is 1 (mark #0 is reserved for the implicit mark at the +beginning of the action log). In the next section we will see that you can +also assign a *name* to a mark (see :meth:`File.mark` for more info on +mark()). +Now, we are going to create another array:: + + >>> another = fileh.create_array('/', 'anotherarray', [4,5], "Another array") + +Right. Now, we can start doing funny things. Let's say that we want to pop +back to the previous mark (that whose value was 1, do you remember?). Let's +introduce the undo() method (see :meth:`File.undo`):: + + >>> fileh.undo() + +Fine, what do you think happened? Well, let's have a look at the object +tree:: + + >>> print(fileh) + tutorial3-1.h5 (File) 'Undo/Redo demo 1' + Last modif.: 'Tue Mar 13 11:43:55 2007' + Object Tree: + / (RootGroup) 'Undo/Redo demo 1' + /anarray (Array(2,)) 'An array' + +What happened with the /anotherarray node we've just created? 
You've guessed it, +it has disappeared because it was created *after* the mark 1. If you are +curious enough you may ask - where has it gone? Well, it has not been +deleted completely; it has just been moved into a special, hidden group of +PyTables that renders it invisible and waiting for a chance to be reborn. + +Now, unwind once more, and look at the object tree:: + + >>> fileh.undo() + >>> print(fileh) + tutorial3-1.h5 (File) 'Undo/Redo demo 1' + Last modif.: 'Tue Mar 13 11:43:55 2007' + Object Tree: + / (RootGroup) 'Undo/Redo demo 1' + +Oops, /anarray has disappeared as well! +Don't worry, it will visit us again very shortly. So, you might be somewhat lost +right now; at which mark are we? Let's ask the :meth:`File.get_current_mark` +method in the file handler:: + + >>> print(fileh.get_current_mark()) + 0 + +So we are at mark #0, remember? Mark #0 is an implicit mark that is created +when you start the log of actions when calling File.enable_undo(). Fine, but +you are missing your too-young-to-die arrays. What can we do about that? +:meth:`File.redo` to the rescue:: + + >>> fileh.redo() + >>> print(fileh) + tutorial3-1.h5 (File) 'Undo/Redo demo 1' + Last modif.: 'Tue Mar 13 11:43:55 2007' + Object Tree: + / (RootGroup) 'Undo/Redo demo 1' + /anarray (Array(2,)) 'An array' + +Great! The /anarray array has come into life again. Just check that it is +alive and well:: + + >>> fileh.root.anarray.read() + [3, 4] + >>> fileh.root.anarray.title + 'An array' + +Well, it looks pretty similar to its past life; +what's more, it is exactly the same object:: + + >>> fileh.root.anarray is one + True + +It was just moved to the hidden group and back again, that's all! +That's kind of fun, so we are going to do the same with /anotherarray:: + + >>> fileh.redo() + >>> print(fileh) + tutorial3-1.h5 (File) 'Undo/Redo demo 1' + Last modif.: 'Tue Mar 13 11:43:55 2007' + Object Tree: + / (RootGroup) 'Undo/Redo demo 1' + /anarray (Array(2,)) 'An array' + /anotherarray (Array(2,)) 'Another array' + +Welcome back, /anotherarray! Just a couple of sanity checks:: + + >>> assert fileh.root.anotherarray.read() == [4,5] + >>> assert fileh.root.anotherarray.title == "Another array" + >>> fileh.root.anotherarray is another + True + +Nice, you managed to turn your data back to life. +Congratulations! But wait, do not forget to close your action log when you +don't need this feature anymore:: + + >>> fileh.disable_undo() + +That will allow you to continue working with your data without actually +requiring PyTables to keep track of all your actions, and more importantly, +allowing your objects to die completely if they have to, not requiring to +keep them anywhere, hence saving process time and space in your database +file. + + +A more complete example +~~~~~~~~~~~~~~~~~~~~~~~ +Now, time for a somewhat more sophisticated demonstration of the Undo/Redo +feature. In it, several marks will be set in different parts of the code flow +and we will see how to jump between these marks with just one method call. 
+You can find the code used in this example in :file:`examples/tutorial3-2.py` + +Let's introduce the first part of the code:: + + import tables + + # Create an HDF5 file + fileh = tables.open_file('tutorial3-2.h5', 'w', title='Undo/Redo demo 2') + + #'-**-**-**-**-**-**- enable undo/redo log -**-**-**-**-**-**-**-' + fileh.enable_undo() + + # Start undoable operations + fileh.create_array('/', 'otherarray1', [3,4], 'Another array 1') + fileh.create_group('/', 'agroup', 'Group 1') + + # Create a 'first' mark + fileh.mark('first') + fileh.create_array('/agroup', 'otherarray2', [4,5], 'Another array 2') + fileh.create_group('/agroup', 'agroup2', 'Group 2') + + # Create a 'second' mark + fileh.mark('second') + fileh.create_array('/agroup/agroup2', 'otherarray3', [5,6], 'Another array 3') + + # Create a 'third' mark + fileh.mark('third') + fileh.create_array('/', 'otherarray4', [6,7], 'Another array 4') + fileh.create_array('/agroup', 'otherarray5', [7,8], 'Another array 5') + +You can see how we have set several marks interspersed in the code flow, +representing different states of the database. Also, note that we have +assigned *names* to these marks, namely 'first', 'second' and 'third'. + +Now, start doing some jumps back and forth in the states of the database:: + + # Now go to mark 'first' + fileh.goto('first') + assert '/otherarray1' in fileh + assert '/agroup' in fileh + assert '/agroup/agroup2' not in fileh + assert '/agroup/otherarray2' not in fileh + assert '/agroup/agroup2/otherarray3' not in fileh + assert '/otherarray4' not in fileh + assert '/agroup/otherarray5' not in fileh + + # Go to mark 'third' + fileh.goto('third') + assert '/otherarray1' in fileh + assert '/agroup' in fileh + assert '/agroup/agroup2' in fileh + assert '/agroup/otherarray2' in fileh + assert '/agroup/agroup2/otherarray3' in fileh + assert '/otherarray4' not in fileh + assert '/agroup/otherarray5' not in fileh + + # Now go to mark 'second' + fileh.goto('second') + assert '/otherarray1' in fileh + assert '/agroup' in fileh + assert '/agroup/agroup2' in fileh + assert '/agroup/otherarray2' in fileh + assert '/agroup/agroup2/otherarray3' not in fileh + assert '/otherarray4' not in fileh + assert '/agroup/otherarray5' not in fileh + +Well, the code above shows how easy is to jump to a certain mark in the +database by using the :meth:`File.goto` method. + +There are also a couple of implicit marks for going to the beginning or the +end of the saved states: 0 and -1. Going to mark #0 means go to the beginning +of the saved actions, that is, when method fileh.enable_undo() was called. +Going to mark #-1 means go to the last recorded action, that is the last +action in the code flow. + +Let's see what happens when going to the end of the action log:: + + # Go to the end + fileh.goto(-1) + assert '/otherarray1' in fileh + assert '/agroup' in fileh + assert '/agroup/agroup2' in fileh + assert '/agroup/otherarray2' in fileh + assert '/agroup/agroup2/otherarray3' in fileh + assert '/otherarray4' in fileh + assert '/agroup/otherarray5' in fileh + + # Check that objects have come back to life in a sane state + assert fileh.root.otherarray1.read() == [3,4] + assert fileh.root.agroup.otherarray2.read() == [4,5] + assert fileh.root.agroup.agroup2.otherarray3.read() == [5,6] + assert fileh.root.otherarray4.read() == [6,7] + assert fileh.root.agroup.otherarray5.read() == [7,8] + +Try going to the beginning of the action log yourself (remember, mark #0) +and check contents of the object tree. 
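+
+A possible solution for that small exercise (a short sketch that continues
+the tutorial3-2.py code above; it is not part of the original script) could
+look like this::
+
+    # Go to the beginning of the action log (the implicit mark #0):
+    # none of the nodes created after enable_undo() should exist there.
+    fileh.goto(0)
+    assert '/otherarray1' not in fileh
+    assert '/agroup' not in fileh
+    assert '/agroup/otherarray2' not in fileh
+    assert '/agroup/agroup2' not in fileh
+    assert '/agroup/agroup2/otherarray3' not in fileh
+    assert '/otherarray4' not in fileh
+    assert '/agroup/otherarray5' not in fileh
+
+    # Jump back to the end of the action log before going on:
+    fileh.goto(-1)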
+ +We have nearly finished this demonstration. As always, do not forget to close +the action log as well as the database:: + + #'-**-**-**-**-**-**- disable undo/redo log -**-**-**-**-**-**-**-' + fileh.disable_undo() + # Close the file + fileh.close() + +You might want to check other examples on Undo/Redo feature that appear in +:file:`examples/undo-redo.py`. + + +Using enumerated types +---------------------- +PyTables includes support for handling enumerated types. Those types are +defined by providing an exhaustive *set* or *list* of possible, named values +for a variable of that type. Enumerated variables of the same type are +usually compared between them for equality and sometimes for order, but are +not usually operated upon. + +Enumerated values have an associated *name* and *concrete value*. Every name +is unique and so are concrete values. An enumerated variable always takes the +concrete value, not its name. Usually, the concrete value is not used +directly, and frequently it is entirely irrelevant. For the same reason, an +enumerated variable is not usually compared with concrete values out of its +enumerated type. For that kind of use, standard variables and constants are +more adequate. + +PyTables provides the Enum (see :ref:`EnumClassDescr`) class to provide +support for enumerated types. Each instance of Enum is an enumerated type (or +*enumeration*). For example, let us create an enumeration of colors + +All these examples can be found in :file:`examples/play-with-enums.py`:: + + >>> import tables + >>> colorList = ['red', 'green', 'blue', 'white', 'black'] + >>> colors = tables.Enum(colorList) + +Here we used a simple list giving the names of enumerated values, but we left +the choice of concrete values up to the Enum class. Let us see the enumerated +pairs to check those values:: + + >>> print("Colors:", [v for v in colors]) + Colors: [('blue', 2), ('black', 4), ('white', 3), ('green', 1), ('red', 0)] + +Names have been given automatic integer concrete values. We can iterate over +values in an enumeration, but we will usually be more interested in +accessing single values. We can get the concrete value associated with a name +by accessing it as an attribute or as an item (the later can be useful for +names not resembling Python identifiers):: + + >>> print("Value of 'red' and 'white':", (colors.red, colors.white)) + Value of 'red' and 'white': (0, 3) + >>> print("Value of 'yellow':", colors.yellow) + Value of 'yellow': + Traceback (most recent call last): + File "", line 1, in ? + File ".../tables/misc/enum.py", line 230, in __getattr__ + raise AttributeError(\*ke.args) + AttributeError: no enumerated value with that name: 'yellow' + >>> + >>> print("Value of 'red' and 'white':", (colors['red'], colors['white'])) + Value of 'red' and 'white': (0, 3) + >>> print("Value of 'yellow':", colors['yellow']) + Value of 'yellow': + Traceback (most recent call last): + File "", line 1, in ? + File ".../tables/misc/enum.py", line 189, in __getitem__ + raise KeyError("no enumerated value with that name: %r" % (name,)) + KeyError: "no enumerated value with that name: 'yellow'" + +See how accessing a value that is not in the enumeration raises the +appropriate exception. 
We can also do the opposite and get the name +that matches a concrete value by using the __call__() method of Enum:: + + >>> print(f"Name of value {colors.red}:", colors(colors.red)) + Name of value 0: red + >>> print("Name of value 1234:", colors(1234)) + Name of value 1234: + Traceback (most recent call last): + File "", line 1, in ? + File ".../tables/misc/enum.py", line 320, in __call__ + raise ValueError( + ValueError: no enumerated value with that concrete value: 1234 + +You can see what we made as using the enumerated type to *convert* a concrete +value into a name in the enumeration. Of course, values out of the +enumeration can not be converted. + + +Enumerated columns +~~~~~~~~~~~~~~~~~~ +Columns of an enumerated type can be declared by using the EnumCol (see +:ref:`ColClassDescr`) class. To see how this works, let us open a new +PyTables file and create a table to collect the simulated results of a +probabilistic experiment. In it, we have a bag full of colored balls; we take +a ball out and annotate the time of extraction and the color of the ball:: + + >>> h5f = tables.open_file('enum.h5', 'w') + >>> class BallExt(tables.IsDescription): + ... ballTime = tables.Time32Col() + ... ballColor = tables.EnumCol(colors, 'black', base='uint8') + >>> tbl = h5f.create_table('/', 'extractions', BallExt, title="Random ball extractions") + >>> + +We declared the ballColor column to be of the enumerated type colors, with a +default value of black. We also stated that we are going to store concrete +values as unsigned 8-bit integer values [4]_. + +Let us use some random values to fill the table:: + + >>> import time + >>> import random + >>> now = time.time() + >>> row = tbl.row + >>> for i in range(10): + ... row['ballTime'] = now + i + ... row['ballColor'] = colors[random.choice(colorList)] # take note of this + ... row.append() + >>> + +Notice how we used the __getitem__() call of colors to get the concrete value +to store in ballColor. This way of appending values to a table automatically checks +for the validity on enumerated values. For instance:: + + >>> row['ballTime'] = now + 42 + >>> row['ballColor'] = 1234 + Traceback (most recent call last): + File "", line 1, in + File "tableextension.pyx", line 1086, in tableextension.Row.__setitem__ + File ".../tables/misc/enum.py", line 320, in __call__ + "no enumerated value with that concrete value: %r" % (value,)) + ValueError: no enumerated value with that concrete value: 1234 + +Note that this check is performed *only* by row.append() and not in other +methods such as tbl.append() or tbl.modify_rows(). Now, after flushing the +table we can see the result of insertions:: + + >>> tbl.flush() + >>> for r in tbl: + ... ballTime = r['ballTime'] + ... ballColor = colors(r['ballColor']) # notice this + ... print("Ball extracted on %d is of color %s." % (ballTime, ballColor)) + Ball extracted on 1173785568 is of color green. + Ball extracted on 1173785569 is of color black. + Ball extracted on 1173785570 is of color white. + Ball extracted on 1173785571 is of color black. + Ball extracted on 1173785572 is of color black. + Ball extracted on 1173785573 is of color red. + Ball extracted on 1173785574 is of color green. + Ball extracted on 1173785575 is of color red. + Ball extracted on 1173785576 is of color white. + Ball extracted on 1173785577 is of color white. + +As a final note, you may be wondering how to access an enumeration associated +with ballColor once the file is closed and reopened. 
You can call tbl.get_enum('ballColor')
+(see :meth:`Table.get_enum`) to get the enumeration back.
+
+
+Enumerated arrays
+~~~~~~~~~~~~~~~~~~
+EArray and VLArray leaves can also be declared to store enumerated values by
+means of the EnumAtom (see :ref:`AtomClassDescr`) class, which works very
+much like EnumCol for tables. Also, Array leaves can be used to open native
+HDF5 enumerated arrays.
+
+Let us create a sample EArray containing ranges of working days as
+bidimensional values::
+
+    >>> workingDays = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5}
+    >>> dayRange = tables.EnumAtom(workingDays, 'Mon', base='uint16')
+    >>> earr = h5f.create_earray('/', 'days', dayRange, (0, 2), title="Working day ranges")
+    >>> earr.flavor = 'python'
+
+Nothing unusual, except for two details. Firstly, we use
+a *dictionary* instead of a list to explicitly set concrete values in the
+enumeration. Secondly, there is no explicit Enum instance created!
+Instead, the dictionary is passed as the first argument to the constructor of
+EnumAtom. If the constructor receives a list or a dictionary instead of an
+enumeration, it automatically builds the enumeration from it.
+
+Now let us feed some data to the array::
+
+    >>> wdays = earr.get_enum()
+    >>> earr.append([(wdays.Mon, wdays.Fri), (wdays.Wed, wdays.Fri)])
+    >>> earr.append([(wdays.Mon, 1234)])
+
+Please note that, since we had no explicit Enum instance, we were forced to
+use get_enum() (see :ref:`EArrayMethodsDescr`) to get it from the array (we
+could also have used dayRange.enum). Also note that we were able to append
+an invalid value (1234). Array methods do not check the validity of
+enumerated values.
+
+Finally, we will print the contents of the array::
+
+    >>> for (d1, d2) in earr:
+    ...     print(f"From {wdays(d1)} to {wdays(d2)} ({d2 - d1 + 1} days).")
+    From Mon to Fri (5 days).
+    From Wed to Fri (3 days).
+    Traceback (most recent call last):
+      File "<stdin>", line 2, in <module>
+      File ".../tables/misc/enum.py", line 320, in __call__
+        "no enumerated value with that concrete value: %r" % (value,))
+    ValueError: no enumerated value with that concrete value: 1234
+
+That was an example of operating on concrete values. It also showed how the
+value-to-name conversion failed because of the value not belonging to the
+enumeration.
+
+Now we will close the file, and this little tutorial on enumerated types is
+done::
+
+    >>> h5f.close()
+
+
+Nested structures in tables
+----------------------------
+PyTables supports the handling of nested structures (or, in other words,
+nested datatypes) in table objects, allowing you to define nested columns of
+arbitrary depth.
+
+Why is that useful? Suppose your data has a certain structure on the column
+level, which you would like to represent in the data model. You can do so by
+creating nested subclasses of IsDescription. The benefit is the ability to
+group and retrieve data more easily. 
The example below may be a bit silly, but it will serve
+as an illustration of the concept::
+
+    import tables as tb
+
+    class Info(tb.IsDescription):
+        """A sub-structure of NestedDescr"""
+        _v_pos = 2   # The position in the whole structure
+        name = tb.StringCol(10)
+        value = tb.Float64Col(pos=0)
+
+    colors = tb.Enum(['red', 'green', 'blue'])
+
+    class NestedDescr(tb.IsDescription):
+        """A description that has several nested columns"""
+        color = tb.EnumCol(colors, 'red', base='uint32')
+        info1 = Info()
+
+        class info2(tb.IsDescription):
+            _v_pos = 1
+            name = tb.StringCol(10)
+            value = tb.Float64Col(pos=0)
+
+            class info3(tb.IsDescription):
+                x = tb.Float64Col(dflt=1)
+                y = tb.UInt8Col(dflt=1)
+
+NestedDescr is the root class, with two *substructures* in it: info1 and
+info2. Note that info1 is an instance of the class Info, which is defined
+before NestedDescr, while info2 is declared within NestedDescr itself. There
+is also a third substructure, info3, which is in turn declared within the
+substructure info2. You can set the position of a substructure in the
+containing object by declaring the special class attribute _v_pos.
+
+
+Creating nested tables
+~~~~~~~~~~~~~~~~~~~~~~~
+Now that we have defined our nested structure, let's create a *nested* table,
+that is, a table with columns that contain subcolumns::
+
+    >>> fileh = tb.open_file("nested-tut.h5", "w")
+    >>> table = fileh.create_table(fileh.root, 'table', NestedDescr)
+
+Done! Now, to populate the table with values, assign a value to each field.
+Nested fields are referenced by giving their full path: follow the structure
+defined earlier and use '/' to access each sublevel of the structure, much
+like you would access a subdirectory on a Unix filesystem::
+
+    >>> row = table.row
+    >>> for i in range(10):
+    ...     row['color'] = colors[['red', 'green', 'blue'][i % 3]]
+    ...     row['info1/name'] = f"name1-{i}"
+    ...     row['info2/name'] = f"name2-{i}"
+    ...     row['info2/info3/y'] = i
+    ...     # Remaining fields will be filled with defaults
+    ...     row.append()
+    >>> table.flush()
+    >>> table.nrows
+    10
+
+As demonstrated above, a substructure's field can be accessed by specifying
+its full path, as defined in the table hierarchy.
+
+
+Reading nested tables
+~~~~~~~~~~~~~~~~~~~~~~
+Now, what happens if we want to read the table? What kind of data container
+would we get? Let's find out::
+
+    >>> nra = table[::4]
+    >>> nra
+    array([(((1.0, 0), 'name2-0', 0.0), ('name1-0', 0.0), 0L),
+           (((1.0, 4), 'name2-4', 0.0), ('name1-4', 0.0), 1L),
+           (((1.0, 8), 'name2-8', 0.0), ('name1-8', 0.0), 2L)],
+          dtype=[('info2', [('info3', [('x', '>f8'), ('y', '|u1')]),
+                 ('name', '|S10'), ('value', '>f8')]),
+                 ('info1', [('name', '|S10'), ('value', '>f8')]),
+                 ('color', '>u4')])
+
+What we've got is a NumPy array with a *compound, nested datatype*, i.e. its
+dtype is a list of name-datatype tuples. For every fourth row in the table
+(note the [::4]), we get one resulting row, giving a total of three.
+
+You can make use of the above object in many different ways.
+
+For example, you can use it to append new data to an existing table object::
+
+    >>> table.append(nra)
+    >>> table.nrows
+    13
+
+Or to create new tables::
+
+    >>> table2 = fileh.create_table(fileh.root, 'table2', nra)
+    >>> table2[:]
+    array([(((1.0, 0), 'name2-0', 0.0), ('name1-0', 0.0), 0L),
+           (((1.0, 4), 'name2-4', 0.0), ('name1-4', 0.0), 1L),
+           (((1.0, 8), 'name2-8', 0.0), ('name1-8', 0.0), 2L)],
+          dtype=[('info2', [('info3', [('x', '<f8'), ('y', '|u1')]),
+                 ('name', '|S10'), ('value', '<f8')]),
+                 ('info1', [('name', '|S10'), ('value', '<f8')]),
+                 ('color', '<u4')])
+
+Finally, we can select nested values that fulfill some condition::
+
+    >>> names = [ x['info2/name'] for x in table if x['color'] == colors.red ]
+    >>> names
+    ['name2-0', 'name2-3', 'name2-6', 'name2-9', 'name2-0']
+
+Note that the row accessor does not provide the natural naming feature, so
+you have to specify the full path of your desired column in order to
+access it.
+
+
+Using the Cols accessor
+~~~~~~~~~~~~~~~~~~~~~~~~
+We can use the cols attribute object (see :ref:`ColsClassDescr`) of the table
+to conveniently access the data stored in a substructure::
+
+    >>> table.cols.info2[1:5]
+    array([((1.0, 1), 'name2-1', 0.0), ((1.0, 2), 'name2-2', 0.0),
+           ((1.0, 3), 'name2-3', 0.0), ((1.0, 4), 'name2-4', 0.0)],
+          dtype=[('info3', [('x', '<f8'), ('y', '|u1')]),
+                 ('name', '|S10'), ('value', '<f8')])
+    >>> table.cols.info2.info3[1:5]
+    array([(1.0, 1), (1.0, 2), (1.0, 3), (1.0, 4)],
+          dtype=[('x', '<f8'), ('y', '|u1')])
+    >>> table.cols._f_col('info2')
+    /table.cols.info2 (Cols), 3 columns
+      info3 (Cols(), Description)
+      name (Column(), |S10)
+      value (Column(), float64)
+
+Here, you've got another Cols object handler because *info2* was a nested
+column. If you select a non-nested column, you will get a regular Column
+instance::
+
+    >>> table.cols._f_col('info2/info3/y')
+    /table.cols.info2.info3.y (Column(), uint8, idx=None)
+
+To summarize, the cols accessor is a very handy and powerful tool to access
+data in nested tables. Don't hesitate to use it, especially when doing
+interactive work.
+
+
+Accessing meta-information of nested tables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Tables have a *description* attribute, which returns an instance of
+the Description class (see :ref:`DescriptionClassDescr`) with the table
+metadata. It can be helpful in understanding the table structure, including
+nested columns::
+
+    >>> table.description
+    {
+      "info2": {
+        "info3": {
+          "x": Float64Col(shape=(), dflt=1.0, pos=0),
+          "y": UInt8Col(shape=(), dflt=1, pos=1)},
+        "name": StringCol(itemsize=10, shape=(), dflt='', pos=1),
+        "value": Float64Col(shape=(), dflt=0.0, pos=2)},
+      "info1": {
+        "name": StringCol(itemsize=10, shape=(), dflt='', pos=0),
+        "value": Float64Col(shape=(), dflt=0.0, pos=1)},
+      "color": EnumCol(enum=Enum({'blue': 2, 'green': 1, 'red': 0}), dflt='red',
+        base=UInt32Atom(shape=(), dflt=0), shape=(), pos=2)}
+
+As you can see, it provides very useful information on both the format and
+the structure of the columns in your table.
+
+You can use the natural naming approach with the description attribute to
+access a subcolumn's metadata::
+
+    >>> table.description.info1
+    {"name": StringCol(itemsize=10, shape=(), dflt='', pos=0),
+    "value": Float64Col(shape=(), dflt=0.0, pos=1)}
+    >>> table.description.info2.info3
+    {"x": Float64Col(shape=(), dflt=1.0, pos=0),
+    "y": UInt8Col(shape=(), dflt=1, pos=1)}
+
+The _v_nested_names attribute provides the names of the columns as well as
+of the structures embedded in them::
+
+    >>> table.description._v_nested_names
+    [('info2', [('info3', ['x', 'y']), 'name', 'value']),
+    ('info1', ['name', 'value']), 'color']
+    >>> table.description.info1._v_nested_names
+    ['name', 'value']
+
+Regardless of which level of the structure is accessed, the output of
+_v_nested_names contains the same kind of information. 
This is because even for nested structures, a Description +object is returned. + +It is possible to create arrays that immitate nested table-like structure with _v_nested_descr attribute:: + + >>> import numpy + >>> table.description._v_nested_descr + [('info2', [('info3', [('x', '()f8'), ('y', '()u1')]), ('name', '()S10'), + ('value', '()f8')]), ('info1', [('name', '()S10'), ('value', '()f8')]), + ('color', '()u4')] + >>> numpy.rec.array(None, shape=0, + dtype=table.description._v_nested_descr) + recarray([], + dtype=[('info2', [('info3', [('x', '>f8'), ('y', '|u1')]), + ('name', '|S10'), ('value', '>f8')]), + ('info1', [('name', '|S10'), ('value', '>f8')]), + ('color', '>u4')]) + >>> numpy.rec.array(None, shape=0, + dtype=table.description.info2._v_nested_descr) + recarray([], + dtype=[('info3', [('x', '>f8'), ('y', '|u1')]), ('name', '|S10'), + ('value', '>f8')]) + +Last but not least, there is a special iterator of the Description class: _f_walk, +which returns different columns of the table:: + + >>> for coldescr in table.description._f_walk(): + ... print(f"column--> {coldescr}") + column--> Description([('info2', [('info3', [('x', '()f8'), ('y', '()u1')]), + ('name', '()S10'), ('value', '()f8')]), + ('info1', [('name', '()S10'), ('value', '()f8')]), + ('color', '()u4')]) + column--> EnumCol(enum=Enum({'blue': 2, 'green': 1, 'red': 0}), dflt='red', + base=UInt32Atom(shape=(), dflt=0), shape=(), pos=2) + column--> Description([('info3', [('x', '()f8'), ('y', '()u1')]), ('name', '()S10'), + ('value', '()f8')]) + column--> StringCol(itemsize=10, shape=(), dflt='', pos=1) + column--> Float64Col(shape=(), dflt=0.0, pos=2) + column--> Description([('name', '()S10'), ('value', '()f8')]) + column--> StringCol(itemsize=10, shape=(), dflt='', pos=0) + column--> Float64Col(shape=(), dflt=0.0, pos=1) + column--> Description([('x', '()f8'), ('y', '()u1')]) + column--> Float64Col(shape=(), dflt=1.0, pos=0) + column--> UInt8Col(shape=(), dflt=1, pos=1) + +See the :ref:`DescriptionClassDescr` for a complete listing of attributes +and methods of the Description object. + +Well, this is the end of this tutorial. As always, remember to close +your files:: + + >>> fileh.close() + +Finally, you may want to have a look at your resulting data file. + +.. code-block:: bash + + $ ptdump -d nested-tut.h5 + / (RootGroup) '' + /table (Table(13,)) '' + Data dump: + [0] (((1.0, 0), 'name2-0', 0.0), ('name1-0', 0.0), 0L) + [1] (((1.0, 1), 'name2-1', 0.0), ('name1-1', 0.0), 1L) + [2] (((1.0, 2), 'name2-2', 0.0), ('name1-2', 0.0), 2L) + [3] (((1.0, 3), 'name2-3', 0.0), ('name1-3', 0.0), 0L) + [4] (((1.0, 4), 'name2-4', 0.0), ('name1-4', 0.0), 1L) + [5] (((1.0, 5), 'name2-5', 0.0), ('name1-5', 0.0), 2L) + [6] (((1.0, 6), 'name2-6', 0.0), ('name1-6', 0.0), 0L) + [7] (((1.0, 7), 'name2-7', 0.0), ('name1-7', 0.0), 1L) + [8] (((1.0, 8), 'name2-8', 0.0), ('name1-8', 0.0), 2L) + [9] (((1.0, 9), 'name2-9', 0.0), ('name1-9', 0.0), 0L) + [10] (((1.0, 0), 'name2-0', 0.0), ('name1-0', 0.0), 0L) + [11] (((1.0, 4), 'name2-4', 0.0), ('name1-4', 0.0), 1L) + [12] (((1.0, 8), 'name2-8', 0.0), ('name1-8', 0.0), 2L) + /table2 (Table(3,)) '' + Data dump: + [0] (((1.0, 0), 'name2-0', 0.0), ('name1-0', 0.0), 0L) + [1] (((1.0, 4), 'name2-4', 0.0), ('name1-4', 0.0), 1L) + [2] (((1.0, 8), 'name2-8', 0.0), ('name1-8', 0.0), 2L) + +Most of the code in this section is also available in +:file:`examples/nested-tut.py`. + +PyTables provides a comprehensive set of tools to work with nested structures and to address your classification needs. 
+Try to avoid nesting your data too deeply as it may lead to very long and convoluted series of lists, tuples, and description objects, which can be hard to read and understand. + + +Other examples in PyTables distribution +--------------------------------------- +Feel free to examine the rest of examples in directory :file:`examples/`, and +try to understand them. We have written several practical sample scripts to +give you an idea of the PyTables capabilities, its way of dealing with HDF5 +objects, and how it can be used in the real world. + +------------ + +.. [1] Appending data to arrays is also supported, but you need to create + special objects called EArray (see :ref:`EArrayClassDescr` for more + info). + +.. [2] Note that you can append not only scalar values to tables, + but also fully multidimensional array objects. + +.. [3] You can even *hide* nodes temporarily. Can you think of a way to do it? + +.. [4] In fact, only integer values are supported right now, but + this may change in the future. + + diff --git a/doc/source/usersguide/usersguide.rst b/doc/source/usersguide/usersguide.rst new file mode 100644 index 0000000..927dd90 --- /dev/null +++ b/doc/source/usersguide/usersguide.rst @@ -0,0 +1,116 @@ +:orphan: + +===================== +PyTables User's Guide +===================== + +.. raw:: latex + + \listoffigures + \listoftables + \clearpage + + +:Authors: Francesc Alted, Ivan Vilata, Scott Prater, Vicent Mas, Tom Hedley, + Antonio Valentino, Jeffrey Whitaker, Anthony Scopatz, Josh Moore +:Copyright: |copy| 2002, 2003, 2004 - Francesc Alted + + |copy| 2005, 2006, 2007 - Cárabos Coop. V. + + |copy| 2008, 2009, 2010 - Francesc Alted + + |copy| 2011–2021 - PyTables maintainers +:Date: |today| +:Version: |version| +:Home Page: http://www.pytables.org + +.. raw:: latex + + \clearpage + + +.. rubric:: Copyright Notice and Statement for PyTables User's Guide + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +a. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +b. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +c. Neither the name of Francesc Alted nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND +CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, +BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +.. |copy| unicode:: U+000A9 .. COPYRIGHT SIGN + + +------------------------- +The PyTables Core Library +------------------------- + +.. 
toctree::
+   :maxdepth: 1
+
+   introduction
+   installation
+   tutorials
+   libref
+   optimization
+
+
+---------------------
+Complementary modules
+---------------------
+
+
+.. toctree::
+   :maxdepth: 1
+
+   filenode
+
+
+----------
+Appendixes
+----------
+
+.. todo:: check why "latex_appendices" config option doesn't work with parts
+.. todo:: try to use raw latex \appendix
+
+.. toctree::
+   :maxdepth: 1
+
+   datatypes
+   condition_syntax
+   parameter_files
+   utilities
+   file_format
+
+
+.. raw:: latex
+
+   \bookmarksetup{startatroot}
+   \addtocontents{toc}{\bigskip}
+
+
+.. toctree::
+   :maxdepth: 1
+
+   bibliography
diff --git a/doc/source/usersguide/utilities.rst b/doc/source/usersguide/utilities.rst
new file mode 100644
index 0000000..be0b72e
--- /dev/null
+++ b/doc/source/usersguide/utilities.rst
@@ -0,0 +1,499 @@
+Utilities
+=========
+PyTables comes with a couple of utilities that make life easier for the
+user. One is called ptdump and lets you see the contents of a PyTables file
+(or generic HDF5 file, if supported). The other one, named ptrepack, allows
+you to (recursively) copy sub-hierarchies of objects present in a file into
+another one, changing, if desired, some of the filters applied to the leaves
+during the copy process.
+
+Normally, these utilities will be installed somewhere in your PATH during the
+installation of the PyTables package, so that you can invoke them from any
+place in your file system after the installation has successfully finished.
+
+
+ptdump
+------
+As mentioned before, the ptdump utility allows you to look into the contents
+of your PyTables files. It lets you see not only the data but also the
+metadata (that is, the *structure* and additional information in the form of
+*attributes*).
+
+Usage
+~~~~~
+For instructions on how to use it, just pass the -h flag to the command:
+
+.. code-block:: bash
+
+   $ ptdump -h
+
+to see the usage message:
+
+.. code-block:: bash
+
+   usage: ptdump [-h] [-v] [-d] [-a] [-s] [-c] [-i] [-R RANGE]
+                 filename[:nodepath]
+
+   The ptdump utility allows you look into the contents of your PyTables files.
+   It lets you see not only the data but also the metadata (that is, the
+   *structure* and additional information in the form of *attributes*).
+
+   positional arguments:
+     filename[:nodepath]  name of the HDF5 file to dump
+
+   optional arguments:
+     -h, --help           show this help message and exit
+     -v, --verbose        dump more metainformation on nodes
+     -d, --dump           dump data information on leaves
+     -a, --showattrs      show attributes in nodes (only useful when -v or -d
+                          are active)
+     -s, --sort           sort output by node name
+     -c, --colinfo        show info of columns in tables (only useful when -v or
+                          -d are active)
+     -i, --idxinfo        show info of indexed columns (only useful when -v or
+                          -d are active)
+     -R RANGE, --range RANGE
+                          select a RANGE of rows (in the form "start,stop,step")
+                          during the copy of *all* the leaves. Default values
+                          are "None,None,1", which means a copy of all the rows.
+
+Read on for a brief introduction to this utility.
+
+
+A small tutorial on ptdump
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+Let's suppose that we want to know only the *structure* of a file. To do
+that, simply pass the file as the only parameter, without any flags.
+
+.. 
code-block:: bash

+   $ ptdump vlarray1.h5
+   / (RootGroup) ''
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+   /vlarray2 (VLArray(3,), shuffle, zlib(1)) 'ragged array of strings'
+
+we can see that the file contains two leaf objects, vlarray1 and vlarray2,
+both instances of VLArray with 3 rows each; two filters have been used to
+create them: shuffle and zlib (with a compression level of 1).
+
+Let's say we want more meta-information. Just add the -v (verbose) flag:
+
+.. code-block:: bash
+
+   $ ptdump -v vlarray1.h5
+   / (RootGroup) ''
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+     atom = Int32Atom(shape=(), dflt=0)
+     byteorder = 'little'
+     nrows = 3
+     flavor = 'numpy'
+   /vlarray2 (VLArray(3,), shuffle, zlib(1)) 'ragged array of strings'
+     atom = StringAtom(itemsize=2, shape=(), dflt='')
+     byteorder = 'irrelevant'
+     nrows = 3
+     flavor = 'python'
+
+so we can see more info about the atoms that are the components of the
+vlarray1 dataset, i.e. they are scalars of type Int32 and have the NumPy
+*flavor*.
+
+If we want information about the attributes on the nodes, we must add the -a
+flag:
+
+.. code-block:: bash
+
+   $ ptdump -va vlarray1.h5
+   / (RootGroup) ''
+   /._v_attrs (AttributeSet), 4 attributes:
+    [CLASS := 'GROUP',
+     PYTABLES_FORMAT_VERSION := '2.0',
+     TITLE := '',
+     VERSION := '1.0']
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+     atom = Int32Atom(shape=(), dflt=0)
+     byteorder = 'little'
+     nrows = 3
+     flavor = 'numpy'
+   /vlarray1._v_attrs (AttributeSet), 3 attributes:
+    [CLASS := 'VLARRAY',
+     TITLE := 'ragged array of ints',
+     VERSION := '1.3']
+   /vlarray2 (VLArray(3,), shuffle, zlib(1)) 'ragged array of strings'
+     atom = StringAtom(itemsize=2, shape=(), dflt='')
+     byteorder = 'irrelevant'
+     nrows = 3
+     flavor = 'python'
+   /vlarray2._v_attrs (AttributeSet), 4 attributes:
+    [CLASS := 'VLARRAY',
+     FLAVOR := 'python',
+     TITLE := 'ragged array of strings',
+     VERSION := '1.3']
+
+
+Let's have a look at the real data:
+
+.. code-block:: bash
+
+   $ ptdump -d vlarray1.h5
+   / (RootGroup) ''
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+     Data dump:
+   [0] [5 6]
+   [1] [5 6 7]
+   [2] [5 6 9 8]
+   /vlarray2 (VLArray(3,), shuffle, zlib(1)) 'ragged array of strings'
+     Data dump:
+   [0] ['5', '66']
+   [1] ['5', '6', '77']
+   [2] ['5', '6', '9', '88']
+
+Here we see a data dump of the 3 rows in the vlarray1 object, in the form of
+a list. Because the object is a VLArray, each row can hold a different number
+of integers.
+
+Say that we are interested only in a specific *row range* of the /vlarray1
+object:
+
+.. code-block:: bash
+
+   $ ptdump -R2,3 -d vlarray1.h5:/vlarray1
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+     Data dump:
+   [2] [5 6 9 8]
+
+Here, we have specified the range of rows between 2 and 3 (the upper limit
+excluded, as usual in Python). See how we have selected only the /vlarray1
+object for doing the dump (vlarray1.h5:/vlarray1).
+
+Finally, you can mix several kinds of information at once:
+
+.. code-block:: bash
+
+   $ ptdump -R2,3 -vad vlarray1.h5:/vlarray1
+   /vlarray1 (VLArray(3,), shuffle, zlib(1)) 'ragged array of ints'
+     atom = Int32Atom(shape=(), dflt=0)
+     byteorder = 'little'
+     nrows = 3
+     flavor = 'numpy'
+   /vlarray1._v_attrs (AttributeSet), 3 attributes:
+    [CLASS := 'VLARRAY',
+     TITLE := 'ragged array of ints',
+     VERSION := '1.3']
+     Data dump:
+   [2] [5 6 9 8]
+
+
+.. 
_ptrepackDescr: + +ptrepack +-------- +This utility is a very powerful one and lets you copy any leaf, group or +complete subtree into another file. During the copy process you are allowed +to change the filter properties if you want so. Also, in the case of +duplicated pathnames, you can decide if you want to overwrite already +existing nodes on the destination file. Generally speaking, ptrepack can be +useful in may situations, like replicating a subtree in another file, change +the filters in objects and see how affect this to the compression degree or +I/O performance, consolidating specific data in repositories or even +*importing* generic HDF5 files and create true PyTables counterparts. + + +Usage +~~~~~ +For instructions on how to use it, just pass the -h flag to the command: + +.. code-block:: bash + + $ ptrepack -h + +to see the message usage: + +.. code-block:: bash + + usage: ptrepack [-h] [-v] [-o] [-R RANGE] [--non-recursive] + [--dest-title TITLE] [--dont-create-sysattrs] + [--dont-copy-userattrs] [--overwrite-nodes] + [--complevel COMPLEVEL] + [--complib {zlib,lzo,bzip2,blosc,blosc:blosclz,blosc:lz4,blosc:lz4hc,blosc:snappy,blosc:zlib,blosc:zstd}] + [--shuffle {0,1}] [--bitshuffle {0,1}] [--fletcher32 {0,1}] + [--keep-source-filters] [--chunkshape CHUNKSHAPE] + [--upgrade-flavors] [--dont-regenerate-old-indexes] + [--sortby COLUMN] [--checkCSI] [--propindexes] + sourcefile:sourcegroup destfile:destgroup + + This utility is very powerful and lets you copy any leaf, group or complete + subtree into another file. During the copy process you are allowed to change + the filter properties if you want so. Also, in the case of duplicated + pathnames, you can decide if you want to overwrite already existing nodes on + the destination file. Generally speaking, ptrepack can be useful in may + situations, like replicating a subtree in another file, change the filters in + objects and see how affect this to the compression degree or I/O performance, + consolidating specific data in repositories or even *importing* generic HDF5 + files and create true PyTables counterparts. + + positional arguments: + sourcefile:sourcegroup + source file/group + destfile:destgroup destination file/group + + optional arguments: + -h, --help show this help message and exit + -v, --verbose show verbose information + -o, --overwrite overwrite destination file + -R RANGE, --range RANGE + select a RANGE of rows (in the form "start,stop,step") + during the copy of *all* the leaves. Default values + are "None,None,1", which means a copy of all the rows. + --non-recursive do not do a recursive copy. Default is to do it + --dest-title TITLE title for the new file (if not specified, the source + is copied) + --dont-create-sysattrs + do not create sys attrs (default is to do it) + --dont-copy-userattrs + do not copy the user attrs (default is to do it) + --overwrite-nodes overwrite destination nodes if they exist. Default is + to not overwrite them + --complevel COMPLEVEL + set a compression level (0 for no compression, which + is the default) + --complib {zlib,lzo,bzip2,blosc,blosc:blosclz,blosc:lz4,blosc:lz4hc,blosc:snappy,blosc:zlib,blosc:zstd} + set the compression library to be used during the + copy. 
Defaults to zlib + --shuffle {0,1} activate or not the shuffle filter (default is active + if complevel > 0) + --bitshuffle {0,1} activate or not the bitshuffle filter (not active by + default) + --fletcher32 {0,1} whether to activate or not the fletcher32 filter (not + active by default) + --keep-source-filters + use the original filters in source files. The default + is not doing that if any of --complevel, --complib, + --shuffle --bitshuffle or --fletcher32 option is + specified + --chunkshape CHUNKSHAPE + set a chunkshape. Possible options are: "keep" | + "auto" | int | tuple. A value of "auto" computes a + sensible value for the chunkshape of the leaves + copied. The default is to "keep" the original value + --upgrade-flavors when repacking PyTables 1.x or PyTables 2.x files, the + flavor of leaves will be unset. With this, such a + leaves will be serialized as objects with the internal + flavor ('numpy' for 3.x series) + --dont-regenerate-old-indexes + disable regenerating old indexes. The default is to + regenerate old indexes as they are found + --sortby COLUMN do a table copy sorted by the index in "column". For + reversing the order, use a negative value in the + "step" part of "RANGE" (see "-r" flag). Only applies + to table objects + --checkCSI force the check for a CSI index for the --sortby + column + --propindexes propagate the indexes existing in original tables. The + default is to not propagate them. Only applies to + table objects + --dont-allow-padding remove the possible padding in compound types in + source files. The default is to propagate it. Only + applies to table objects + + +Read on for a brief introduction to this utility. + +A small tutorial on ptrepack +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Imagine that we have ended the tutorial 1 (see the output of +examples/tutorial1-1.py), and we want to copy our reduced data (i.e. those +datasets that hangs from the /column group) to another file. First, let's +remember the content of the examples/tutorial1.h5: + +.. code-block:: bash + + $ ptdump tutorial1.h5 + / (RootGroup) 'Test file' + /columns (Group) 'Pressure and Name' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + /detector (Group) 'Detector information' + /detector/readout (Table(10,)) 'Readout example' + +Now, copy the /columns to other non-existing file. That's easy: + +.. code-block:: bash + + $ ptrepack tutorial1.h5:/columns reduced.h5 + +That's all. Let's see the contents of the newly created reduced.h5 file: + +.. code-block:: bash + + $ ptdump reduced.h5 + / (RootGroup) '' + /name (Array(3,)) 'Name column selection' + /pressure (Array(3,)) 'Pressure column selection' + +so, you have copied the children of /columns group into the *root* of the +reduced.h5 file. + +Now, you suddenly realized that what you intended to do was to copy all the +hierarchy, the group /columns itself included. You can do that by just +specifying the destination group: + +.. code-block:: bash + + $ ptrepack tutorial1.h5:/columns reduced.h5:/columns + $ ptdump reduced.h5 + / (RootGroup) '' + /name (Array(3,)) 'Name column selection' + /pressure (Array(3,)) 'Pressure column selection' + /columns (Group) '' + /columns/name (Array(3,)) 'Name column selection' + /columns/pressure (Array(3,)) 'Pressure column selection' + +OK. Much better. But you want to get rid of the existing nodes on the new +file. You can achieve this by adding the -o flag: + +.. 
code-block:: bash

+   $ ptrepack -o tutorial1.h5:/columns reduced.h5:/columns
+   $ ptdump reduced.h5
+   / (RootGroup) ''
+   /columns (Group) ''
+   /columns/name (Array(3,)) 'Name column selection'
+   /columns/pressure (Array(3,)) 'Pressure column selection'
+
+where you can see how the old contents of the reduced.h5 file have been
+overwritten.
+
+You can copy just one single node in the repacking operation and change its
+name in destination:
+
+.. code-block:: bash
+
+   $ ptrepack tutorial1.h5:/detector/readout reduced.h5:/rawdata
+   $ ptdump reduced.h5
+   / (RootGroup) ''
+   /rawdata (Table(10,)) 'Readout example'
+   /columns (Group) ''
+   /columns/name (Array(3,)) 'Name column selection'
+   /columns/pressure (Array(3,)) 'Pressure column selection'
+
+where the /detector/readout has been copied to /rawdata in the destination.
+
+We can change the filter properties as well:
+
+.. code-block:: bash
+
+   $ ptrepack --complevel=1 tutorial1.h5:/detector/readout reduced.h5:/rawdata
+   Problems doing the copy from 'tutorial1.h5:/detector/readout' to 'reduced.h5:/rawdata'
+   The error was --> tables.exceptions.NodeError: destination group \``/\`` already has a node named \``rawdata``; you may want to use the \``overwrite`` argument
+   The destination file looks like:
+   / (RootGroup) ''
+   /rawdata (Table(10,)) 'Readout example'
+   /columns (Group) ''
+   /columns/name (Array(3,)) 'Name column selection'
+   /columns/pressure (Array(3,)) 'Pressure column selection'
+   Traceback (most recent call last):
+     File "utils/ptrepack", line 3, in ?
+       main()
+     File ".../tables/scripts/ptrepack.py", line 349, in main
+       stats = stats, start = start, stop = stop, step = step)
+     File ".../tables/scripts/ptrepack.py", line 107, in copy_leaf
+       raise RuntimeError, "Please check that the node names are not
+       duplicated in destination, and if so, add the --overwrite-nodes flag
+       if desired."
+   RuntimeError: Please check that the node names are not duplicated in
+   destination, and if so, add the --overwrite-nodes flag if desired.
+
+Oops! We ran into problems: we forgot that the /rawdata pathname already
+existed in the destination file. Let's add the --overwrite-nodes flag, as the
+error message suggested:
+
+.. code-block:: bash
+
+   $ ptrepack --overwrite-nodes --complevel=1 tutorial1.h5:/detector/readout
+     reduced.h5:/rawdata
+   $ ptdump reduced.h5
+   / (RootGroup) ''
+   /rawdata (Table(10,), shuffle, zlib(1)) 'Readout example'
+   /columns (Group) ''
+   /columns/name (Array(3,)) 'Name column selection'
+   /columns/pressure (Array(3,)) 'Pressure column selection'
+
+You can check how the filter properties have been changed for the /rawdata
+table. Note that the other nodes still exist.
+
+Finally, let's copy a *slice* of the readout table in origin to destination,
+under a new group called /slices and with the name, for example, aslice:
+
+.. code-block:: bash
+
+   $ ptrepack -R1,8,3 tutorial1.h5:/detector/readout reduced.h5:/slices/aslice
+   $ ptdump reduced.h5
+   / (RootGroup) ''
+   /rawdata (Table(10,), shuffle, zlib(1)) 'Readout example'
+   /columns (Group) ''
+   /columns/name (Array(3,)) 'Name column selection'
+   /columns/pressure (Array(3,)) 'Pressure column selection'
+   /slices (Group) ''
+   /slices/aslice (Table(3,)) 'Readout example'
+
+Note how only 3 rows of the original readout table have been copied to the
+new aslice destination. Note as well how the previously nonexistent slices
+group has been created in the same operation.
+
+
+
+pt2to3
+------
+
+The PyTables 3.x series now follows the `PEP 8`_ coding standard. 
This makes +using PyTables more idiomatic with surrounding Python code that also adheres +to this standard. The primary way that the 2.x series was *not* PEP 8 +compliant was with respect to variable naming conventions. Approximately 450 +API variables were identified and updated for PyTables 3.x. + +To ease migration, PyTables ships with a new ``pt2to3`` command line tool. +This tool will run over a file and replace any instances of the old variable +names with the 3.x version of the name. This tool covers the overwhelming +majority of cases was used to transition the PyTables code base itself! However, +it may also accidentally also pick up variable names in 3rd party codes that +have *exactly* the same name as a PyTables' variable. This is because ``pt2to3`` +was implemented using regular expressions rather than a fancier AST-based +method. By using regexes, ``pt2to3`` works on Python and Cython code. + + +``pt2to3`` **help:** + +.. code-block:: bash + + usage: pt2to3 [-h] [-r] [-p] [-o OUTPUT] [-i] filename + + PyTables 2.x -> 3.x API transition tool This tool displays to standard out, so + it is common to pipe this to another file: $ pt2to3 oldfile.py > newfile.py + + positional arguments: + filename path to input file. + + optional arguments: + -h, --help show this help message and exit + -r, --reverse reverts changes, going from 3.x -> 2.x. + -p, --no-ignore-previous + ignores previous_api() calls. + -o OUTPUT output file to write to. + -i, --inplace overwrites the file in-place. + +Note that ``pt2to3`` only works on a single file, not a a directory. However, +a simple BASH script may be written to run ``pt2to3`` over an entire directory +and all sub-directories: + +.. code-block:: bash + + #!/bin/bash + for f in $(find .) + do + echo $f + pt2to3 $f > temp.txt + mv temp.txt $f + done + +.. _PEP 8: http://www.python.org/dev/peps/pep-0008/ diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..0128a88 --- /dev/null +++ b/environment.yml @@ -0,0 +1,3 @@ +name: PyTables +dependencies: + - hdf5 diff --git a/examples/Single_Table-vs-EArray_Table.ipynb b/examples/Single_Table-vs-EArray_Table.ipynb new file mode 100644 index 0000000..12b9c65 --- /dev/null +++ b/examples/Single_Table-vs-EArray_Table.ipynb @@ -0,0 +1,846 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using a single Table vs EArray + Table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The PyTables community keep asking what can be considered a FAQ. Namely, should I use a single Table for storing my data, or should I split it in a Table and an Array?\n", + "\n", + "Although there is not a totally general answer, the study below address this for the common case where one has 'raw data' and other data that can be considered 'meta'. 
See for example: https://groups.google.com/forum/#!topic/pytables-users/vBEiaRzp3gI" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n", + "PyTables version: 3.2.4.dev0\n", + "HDF5 version: 1.8.16\n", + "NumPy version: 1.11.0\n", + "Numexpr version: 2.4.3 (not using Intel's VML/MKL)\n", + "Zlib version: 1.2.8 (in Python interpreter)\n", + "Blosc version: 1.9.2 (2016-06-08)\n", + "Blosc compressors: blosclz (1.0.5), lz4 (1.7.2), lz4hc (1.7.2), snappy (1.1.1), zlib (1.2.8)\n", + "Blosc filters: shuffle, bitshuffle\n", + "Cython version: 0.23.4\n", + "Python version: 2.7.12 (default, Jul 1 2016, 15:12:24) \n", + "[GCC 5.4.0 20160609]\n", + "Platform: Linux-4.6.4-gentoo-x86_64-with-Ubuntu-16.04-xenial\n", + "Byte-ordering: little\n", + "Detected cores: 8\n", + "Default encoding: ascii\n", + "Default FS encoding: UTF-8\n", + "Default locale: (en_US, UTF-8)\n", + "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import tables\n", + "tables.print_versions()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "LEN_PMT = int(1.2e6)\n", + "NPMTS = 12\n", + "NEVENTS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!rm PMT*.h5" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def gaussian(x, mu, sig):\n", + " return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))\n", + "\n", + "x = np.linspace(0, 1, 1e7)\n", + "rd = (gaussian(x, 1, 1.) 
* 1e6).astype(np.int32)\n", + "\n", + "def raw_data(length):\n", + " # Return the actual data that you think it represents PM waveforms better \n", + " #return np.arange(length, dtype=np.int32)\n", + " return rd[:length]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Tables to store everything" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "class PMTRD(tables.IsDescription):\n", + " # event_id = tables.Int32Col(pos=1, indexed=True) \n", + " event_id = tables.Int32Col(pos=1)\n", + " npmt = tables.Int8Col(pos=2)\n", + " pmtrd = tables.Int32Col(shape=LEN_PMT, pos=3) " + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def one_table(filename, filters):\n", + " with tables.open_file(\"{}-{}-{}.h5\".format(filename, filters.complib, filters.complevel), \"w\", filters=filters) as h5t:\n", + " pmt = h5t.create_table(h5t.root, \"pmt\", PMTRD, expectedrows=NEVENTS*NPMTS)\n", + " pmtr = pmt.row\n", + " for i in range(NEVENTS):\n", + " for j in range(NPMTS):\n", + " pmtr['event_id'] = i\n", + " pmtr['npmt'] = j\n", + " pmtr['pmtrd'] = raw_data(LEN_PMT)\n", + " pmtr.append()" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 220 ms, sys: 104 ms, total: 324 ms\n", + "Wall time: 323 ms\n" + ] + } + ], + "source": [ + "# Using no compression\n", + "%time one_table(\"PMTs\", tables.Filters(complib=\"zlib\", complevel=0))" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.16 s, sys: 16 ms, total: 3.17 s\n", + "Wall time: 3.17 s\n" + ] + } + ], + "source": [ + "# Using Zlib (level 5) compression\n", + "%time one_table(\"PMTs\", tables.Filters(complib=\"zlib\", complevel=5))" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 596 ms, sys: 4 ms, total: 600 ms\n", + "Wall time: 599 ms\n" + ] + } + ], + "source": [ + "# Using Blosc (level 9) compression\n", + "%time one_table(\"PMTs\", tables.Filters(complib=\"blosc:lz4\", complevel=9))" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 42M PMTs-blosc:lz4-9.h5 550M PMTs-None-0.h5 17M PMTs-zlib-5.h5\r\n" + ] + } + ], + "source": [ + "ls -sh *.h5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, using no compression leads to best speed, whereas Zlib can compress data by ~32x. Zlib is ~3x slower than using no compression though. On its hand, the Blosc compressor is faster but it can barely compress the dataset." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using EArrays for storing raw data and Table for other metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def rawdata_earray(filename, filters):\n", + " with tables.open_file(\"{}-{}.h5\".format(filename, filters.complib), \"w\", filters=filters) as h5a:\n", + " pmtrd = h5a.create_earray(h5a.root, \"pmtrd\", tables.Int32Atom(), shape=(0, NPMTS, LEN_PMT),\n", + " chunkshape=(1,1,LEN_PMT))\n", + " for i in range(NEVENTS):\n", + " rdata = []\n", + " for j in range(NPMTS):\n", + " rdata.append(raw_data(LEN_PMT))\n", + " pmtrd.append(np.array(rdata).reshape(1, NPMTS, LEN_PMT))\n", + " pmtrd.flush()" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 120 ms, sys: 132 ms, total: 252 ms\n", + "Wall time: 250 ms\n" + ] + } + ], + "source": [ + "# Using no compression\n", + "%time rawdata_earray(\"PMTAs\", tables.Filters(complib=\"zlib\", complevel=0))" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.72 s, sys: 24 ms, total: 2.74 s\n", + "Wall time: 2.74 s\n" + ] + } + ], + "source": [ + "# Using Zlib (level 5) compression\n", + "%time rawdata_earray(\"PMTAs\", tables.Filters(complib=\"zlib\", complevel=5))" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 224 ms, sys: 36 ms, total: 260 ms\n", + "Wall time: 258 ms\n" + ] + } + ], + "source": [ + "# Using Blosc (level 5) compression\n", + "%time rawdata_earray(\"PMTAs\", tables.Filters(complib=\"blosc:lz4\", complevel=9))" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.0M PMTAs-blosc:lz4.h5 4.1M PMTAs-zlib.h5\t 550M PMTs-None-0.h5\r\n", + "550M PMTAs-None.h5\t 42M PMTs-blosc:lz4-9.h5 17M PMTs-zlib-5.h5\r\n" + ] + } + ], + "source": [ + "!ls -sh *.h5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We see that by using the Blosc compressor one can achieve around 10x faster output operation wrt Zlib, although the compression ratio can be somewhat smaller (but still pretty good)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Add the event IDs in a separate table in the same file\n", + "class PMTRD(tables.IsDescription):\n", + " # event_id = tables.Int32Col(pos=1, indexed=True) \n", + " event_id = tables.Int32Col(pos=1)\n", + " npmt = tables.Int8Col(pos=2)\n", + "\n", + "def add_table(filename, filters):\n", + " with tables.open_file(\"{}-{}.h5\".format(filename, filters.complib), \"a\", filters=filters) as h5a:\n", + " pmt = h5a.create_table(h5a.root, \"pmt\", PMTRD)\n", + " pmtr = pmt.row\n", + " for i in range(NEVENTS):\n", + " for j in range(NPMTS):\n", + " pmtr['event_id'] = i\n", + " pmtr['npmt'] = j\n", + " pmtr.append()" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 1.79 ms\n" + ] + } + ], + "source": [ + "# Using no compression\n", + "%time add_table(\"PMTAs\", tables.Filters(complib=\"zlib\", complevel=0))" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\n", + "Wall time: 1.74 ms\n" + ] + } + ], + "source": [ + "# Using Zlib (level 5) compression\n", + "%time add_table(\"PMTAs\", tables.Filters(complib=\"zlib\", complevel=5))" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4 ms, sys: 0 ns, total: 4 ms\n", + "Wall time: 1.52 ms\n" + ] + } + ], + "source": [ + "# Using Blosc (level 9) compression\n", + "%time add_table(\"PMTAs\", tables.Filters(complib=\"blosc:lz4\", complevel=9))" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9.0M PMTAs-blosc:lz4.h5 4.1M PMTAs-zlib.h5\t 550M PMTs-None-0.h5\r\n", + "550M PMTAs-None.h5\t 42M PMTs-blosc:lz4-9.h5 17M PMTs-zlib-5.h5\r\n" + ] + } + ], + "source": [ + "!ls -sh *.h5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "After adding the table we continue to see that a better compression ratio is achieved for EArray + Table with respect to a single Table. Also, Blosc can make writing files significantly faster than not using compression (it has to write less)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieving data from a single Table" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def read_single_table(complib, complevel):\n", + " with tables.open_file(\"PMTs-{}-{}.h5\".format(complib, complevel), \"r\") as h5t:\n", + " pmt = h5t.root.pmt\n", + " for i, row in enumerate(pmt):\n", + " event_id, npmt, pmtrd = row[\"event_id\"], row[\"npmt\"], row[\"pmtrd\"][:]\n", + " if i % 20 == 0:\n", + " print(event_id, npmt, pmtrd[0:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 24 ms, sys: 80 ms, total: 104 ms\n", + "Wall time: 99 ms\n" + ] + } + ], + "source": [ + "%time read_single_table(\"None\", 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 576 ms, sys: 24 ms, total: 600 ms\n", + "Wall time: 593 ms\n" + ] + } + ], + "source": [ + "%time read_single_table(\"zlib\", 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 776 ms, sys: 16 ms, total: 792 ms\n", + "Wall time: 782 ms\n" + ] + } + ], + "source": [ + "%time read_single_table(\"blosc:lz4\", 9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As Blosc could not compress the table, it has a performance that is worse (quite worse actually) to the uncompressed table. On its hand, Zlib can be more than 3x slower for reading than without compression." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieving data from the EArray + Table" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def read_earray_table(complib, complevel):\n", + " with tables.open_file(\"PMTAs-{}.h5\".format(complib, \"r\")) as h5a:\n", + " pmt = h5a.root.pmt\n", + " pmtrd_ = h5a.root.pmtrd\n", + " for i, row in enumerate(pmt):\n", + " event_id, npmt = row[\"event_id\"], row[\"npmt\"]\n", + " pmtrd = pmtrd_[event_id, npmt]\n", + " if i % 20 == 0:\n", + " print(event_id, npmt, pmtrd[0:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 4 ms, sys: 80 ms, total: 84 ms\n", + "Wall time: 82 ms\n" + ] + } + ], + "source": [ + "%time read_earray_table(\"None\", 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 992 ms, sys: 104 ms, total: 1.1 s\n", + "Wall time: 1.09 s\n" + ] + } + ], + "source": [ + "%time read_earray_table(\"zlib\", 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(1, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(3, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(5, 0, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(6, 8, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "(8, 4, array([606530, 606530, 606530, 606530, 606530], dtype=int32))\n", + "CPU times: user 168 ms, sys: 4 ms, total: 172 ms\n", + "Wall time: 171 ms\n" + ] + } + ], + "source": [ + "%time read_earray_table(\"blosc:lz4\", 9)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, the EArray + Table takes a similar time to read than a pure Table approach when no compression is used. And for some reason, when Zlib is used for compressing the data, the EArray + Table scenario degrades read speed significantly. However, when the Blosc compression is used, the EArray + Table works actually faster than for the single Table. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Some plots on speeds and sizes" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look at the speeds at which data can be stored and read using the different paradigms:" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": { + "collapsed": false, + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAGxCAYAAABLDT5KAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XucVXW9//HXe7h5YQaYER0QGBSlvPyM1LyUFwbvKYql\nJuQtLU3jB5bHvKQC2eF4tGOaxwx+CUEkpWiiYeJRwaJOXjA08oKR3ERRGDBA5fr5/bEX02YuzIXZ\n7L2G9/PxWA/W5bu++7NnNvPZ3+/6rvVVRGBmZmbpUZTvAMzMzKxpnLzNzMxSxsnbzMwsZZy8zczM\nUsbJ28zMLGWcvM3MzFLGydss5STNkHRpvuPY0XbW920GTt5mzSbpGEl/lLRK0nJJf5B0WL7jyiZp\nhKSJOaj3eEmLW7reXJH0tqQB+Y7DrKW0zXcAZmkkqRh4HLgCeAhoDxwLrMtnXDuQgGY/4UlSm4jY\n1ILxmO1U3PI2a56+QETEg5GxLiKejoi5AJIuljRL0j1Jy/y17JafpBJJP5O0VNJiSbdKUtbxS5Nz\nVkj6naReWcdOkvS6pJWS7iGTSGuRdApwI/AVSasl/SXZ303S1KTueZK+Xt+blPRFSX+T9M8kzu9I\n2g14Auie1PtPSeWS2ku6S9I7kpZI+pGkdkk9xyfnf1fSu8C4ZP8Zkv6SvJdZkv7PNmKp931L2lfS\nM0kPyPuSJkkqSY5NBHoBjyex/luy/0FJ7yb1zZR0YH2vbVZonLzNmmcesEnSzyWdKqlzHWWOBN4C\nyoCRwCNZ5SYA64F9gc8CJwFfB5B0FnA9MAjoCvwBmJwc2wN4mExS3gOYD3yhrgAjYjowGvh1RBRH\nxGeTQ78GFgHlwLnAaEn963mfPwO+ERElwMHAsxHxEXAasDSptyQi3gNuAo4ADgE+k6zflFVXOdCZ\nTCK9XNJngfuBbwClwBjgsS0JP5uksgbet5L3Wg4cAPQg8zMnIi5K3u8ZSaw/TM55AugD7Am8DPyy\nnp+BWeGJCC9evDRjAT5FpgW5iEwingp0TY5dDCypUf554KtkksUnQIesY+cDzyTrTwBfyzpWBKwF\negIXAn+qUe9i4NJ6YhwBTMza7gFsAHbL2jcaGFfP+QvIJNfiGvuPBxbV2Pd34JSs7ZOBf2SV/wRo\nl3X8J8CoGnW8ARxbRxxNfd9nAbOztt8GBmzjd9kZ2FzzfXrxUqiLW95mzRQRb0bEpRHRi0yrtDtw\nV1aRd2qcsjApUwG0A96VVCVpJfBTMq1skuN3J8eqgBVkri/vnZxfc6BYUwaOdQeqItN6zo5r73rK\nfxk4HViYjO4+qoG6F9Wot3vW9gcRsSFruwK4Zsv7TH4OPWqck113ve9b0p6SJifd9auASWRa6HWS\nVCTpNkl/T8q/TeZnXO85ZoXEydusBUTEPODnZJL4FjUTYi9gKZmk8wlQFhGlEdElIjpHxCFJuUXA\nFcmxLcc7RsSfgXeTerL13FZoNbaXAqWSdq8RV80vGlve1+yI2NJ9PxV4sJ56SeqoyNquSF6vvlgW\nA/9ex/v8dR11N/S+R5NpOR8UEZ2BC9h6LEDN1x4CDCTTGu8M9E7K1zl+wKzQOHmbNYOkTyWDt/ZO\ntnsCg4H/zSq2p6T/K6mtpHOBTwNPROb68FPAjyQVK2NfSccl540BbtwygEpSJ0nnJMemAQdKGiSp\njaThwF7bCHUZ0HvLYLiIWAL8CfgPSR0kHQJcBvyijvfYTtIQSSWRGRm+GtiUVW/ZlkFhiV8BN0na\nI7k2f3Nd9Wb5f8A3JR2RvN7uyQC53esoW9f7Ls86XgysAVYnv5Nra5z/HpnxBdnl1wErk9f7D7Zj\n9LzZjubkbdY8q8kMSHte0moyCfFV4N+yyjwP7A8sB24FvhwRK5NjF5G5vew1oIrM7WblABHxKHAb\n8KukS/dV4NTk2Aoyg8z+M6m3D/DHbcT5EJnW5ApJLyX7hgD7kGkVPwzcHBEz6jn/QuDtJI7LyVyz\nJyLeJDOI7h9Jl3c58APgpSTeV5L1f68vsIiYTeZ6+n8nlwfmkRkrUFfZut73rKwio4DDgFVkbuF7\nuEYVtwE3J7F+h8yAwUVkegvmkvn9maWGInL3ZVNSD2AimZbBZuD/RcSPa5Q5nkx33D+SXY9ExA9y\nFpTZDiDpYuCyiDiuwcJmZk2U64e0bAS+ExFzJHUEZkt6KiLeqFHu9xFxZo5jMTMzaxVy2m0eEe9F\nxJxkfQ3wOnWPavUgETMzs0bKabf5Vi8k9QZmAgcniXzL/uOBKcASMtfgro2I13ZIUGZmZim0Q55t\nnnSZTwGGZyfuxGygIiI+knQa8CiZR0/WrMMjQc3MbKcTEbV6p3Pe8pbUFvgt8LuIuLsR5d8GDouI\nqhr7Y0f1ErSUkSNHMnLkyHyHYbZd/Dm21iKNn2VJdSbvHXGr2DjgtfoSt6S9staPIPOFoqqusmZm\nZpbjbnNJXyBzX+hflZnRKMhMLFBBZkamscA5kq4k87zlj4Gv5DImMzOztMtp8o6IPwJtGihzL3Bv\nLuPIl/79++c7BLPt5s+xtRat6bO8w0abb680XvM2MzPbHvVd894ho83N0q53794sXLgw32FYHlRU\nVLBgwYJ8h2G2Fbe8zRoh+fab7zAsD/y7t3zK52hzMzMza0FO3mZmZinj5G1mZpYyTt5mZmYp4+Rt\n1oo98MADnHrqqS1SV2VlJePGjWuRuhpr+vTp7L///vUeHzx4MKNHj96BEZkVBidvs2bqXV6OpJwt\nvcvLGxXHrFmz+MIXvkDnzp3ZY489OPbYY5k9ezY
AQ4YM4cknn8zlj6HB2IqLiykpKaFjx44UFRVR\nUlJSvW/JkiUN1iF5xmCzmnyft1kzLVy2jFzeQKRlyxoss3r1agYOHMiYMWM499xzWb9+PX/4wx/o\n0KFDDiNrvGOOOYbVq1cDsHDhQvbdd18+/PBDJ2Sz7eSWt1mKzZs3D0mcd955SKJDhw6ceOKJHHzw\nwQBMmDCBY489trp8UVERY8aMoW/fvpSWljJ06NDqY5s3b+aaa66ha9eu9OnTh3vvvZeioiI2b95c\n52uPGzeOAw88kLKyMk477TQWLVrUqJhr3jM9duxYDjjgAEpKSujbty/jx4+vVX7kyJGUlZXRp08f\npkyZUm/dv/nNb/jMZz5Dly5dOP7443n99dcbFZNZ2jh5m6VY3759adOmDZdccglPPvkkq1atqlWm\nZit32rRpzJ49m1deeYUHH3yQp556Csgk0enTp/Pqq6/y8ssv8+ijj9bbQp46dSq33XYbjz76KB98\n8AHHHnssgwcPbtZ76N69O9OnT+ef//wnP/3pT/nWt761VdJdsGABGzduZNmyZYwdO5aLL764zqfd\n/fnPf2bo0KFMmDCBqqoqLrzwQgYNGlTvlw+zNHPyNkux4uJiZs2aRVFREZdffjl77rknZ511Fh98\n8EG959xwww0UFxfTs2dPKisrmTNnDgAPPfQQw4cPp1u3bnTq1Inrr7++3jrGjBnDDTfcQN++fSkq\nKuL6669nzpw5LF68uMnv4YwzzqBXr14ADBgwgOOPP55Zs2ZVH2/Xrh233HILbdu25YQTTuDEE0+s\ns/U9duxYhg4dSr9+/ZDE17/+ddatW1d9/d+sNXHyNku5T33qU4wbN45FixYxd+5cli5dytVXX11v\n+b322qt6fbfddmPNmjUALF26lJ49e1Yfy16vaeHChQwfPpzS0lJKS0spKytDEu+8806T43/sscc4\n8sgjKSsro0uXLsyYMYPly5dXH+/atSvt27ev3q6oqGDp0qV1xjR69OjqmLp06cLy5cubFZNZoXPy\nNmtF+vbtyyWXXMLcuXObfG63bt22Gv29rWvYPXv2ZMyYMVRVVVFVVcXKlStZs2YNRx11VJNe86OP\nPuK8885jxIgRLF++nJUrV1JZWbnVdfHly5ezfv36reLq3r17nTF9//vfrxXToEGDmhSTWRo4eZul\n2Jtvvsmdd95Z3bpcvHgxkydP5uijj25yXeeddx533303S5cuZdWqVdx+++31lv3mN7/J6NGjee21\n1wD48MMPtzmQbIuag9U+/vhjNm7cSNeuXYFMK3zmzJlblVm/fj233norGzZs4Nlnn+Xpp5/mnHPO\nqVX35Zdfzj333FPdTb5mzRoef/xxPvnkkwbjMksb3ypm1kwVe+3VqNu5tqf+hhQXF/P8889z5513\n8uGHH9K5c2cGDhxYb+KtOQAte/sb3/gGb731FocccgidOnVi2LBhPPfccxQVFdUqO2jQINauXcv5\n55/PokWL6NSpEyeddFKdSXVbr19WVsYPf/hDzjjjDDZu3MjZZ5/N6aefvlWZffbZh7Zt21JeXk6n\nTp34+c9/TkVFRa36Pv/5z/PjH/+YK664gvnz57P77rtz3HHHcfLJJ28zJrM08pSgOdSrVzmLF+fu\nj3tL69lzLxYtei/fYRSknXFayCeffJIrr7ySt99+O9+h5NXO+Lu3wlHflKBueefQ4sXLmDEj31E0\nXmVler5oWMv75JNPmDFjBieffDLvvfceo0aN4ktf+lK+wzKzOviat5kBmevRI0aMoLS0lMMOO4yD\nDjqIUaNG5TssM6uDW95mBsCuu+7KCy+8kO8wzKwRnLzNzGynkLZxSNvi5G1mZjuFtI1DAqisrHu/\nr3mbmZmljJO3mZlZyjh5m5mZpYyTt5ml3uDBgxk9enSdx9atW0dRUVGdk5mYpVVOk7ekHpKelfQ3\nSX+VNKyecj+W9JakOZL65TIms5bSq1c5knK29OpV3uhYevfuzW677UZJSQnFxcWUlJQwbNi//rvN\nnDmToqIi7rjjjlz8KLbblVdeWR13hw4daN++PSUlJZSUlNR6XGpz1DcvuVla5Xq0+UbgOxExR1JH\nYLakpyLijS0FJJ0G9ImI/SUdCfwUaNrURGZ5kOuRq0154p0kpk2bRmU9Q1MnTpxIWVkZEydO5Npr\nr623nk2bNtGmTZsG9zXXwoUL6d+/f61Hrt53333cd999AIwaNYr58+czceLEFnlNqD0hilna5bTl\nHRHvRcScZH0N8Dqwd41iZwETkzLPA50kNTwjg5ltpb4E9dFHHzFlyhTuvfde3nrrLV5++eXqYwsX\nLqSoqIhx48ZRUVHBCSecUOc+yMw61q1bN7p06UL//v2rZxR76aWXKC8v3+r1H3nkEfr1q7sTrTmt\n4E2bNnHOOedQXl5OaWkpJ5xwAvPmzduqzHvvvceAAQMoKSnhpJNOqreb/JNPPuHqq6+mV69edO/e\nneHDh7Nhw4Ymx2SWTzvsmrek3kA/4Pkah/YGFmdtv0PtBG9mzfTwww9TXFzMueeey8knn8yECRNq\nlfn973/PG2+8wfTp0+vd98UvfpH58+fz/vvvc+ihh/LVr34VgMMPP5w99tiDp556qvrcSZMmcckl\nl7To+xg0aBBvv/027733Hp/+9Ke5+OKLtzo+adIkbrvtNpYvX85+++1X6/gW3/72t1m6dCl/+9vf\nePPNN5k3bx633XZbi8Zqlms75CEtSZf5FGB40gJvlpEjR1av9+/fn/79+293bGatxaBBg2jbti0R\ngSTuuOMOLrvsMiZOnMj555+PJIYMGcLw4cO58847q7vCJTFq1Ch23XXX6rrq2pedjG+55Rbuuusu\nVq9eTXFxMRdddBG/+MUvOOWUU6iqqmL69OnV3eDZmtt93aZNGy644ILq7ZtvvpkePXqwfv162rdv\nX/3+jzjiCABGjx5NWVkZK1asoGPHjtXnbdq0iXHjxrFgwQKKi4sBuO6667jqqqu4+eabmxWbWUua\nMyezNCTnyVtSWzKJ+xcRMbWOIu8APbO2eyT7aslO3ma2talTp9a65r1kyRJmzJhR3bI888wzufzy\ny5k2bRpnnnlmdbkePXrUqi973+bNm7nxxhuZMmUKy5cvrx5Ut3z5coqLi7ngggs48MAD+fjjj3nw\nwQc57rjj2CuZj3zy5MlcddVVSGLTpk2sXbuW0tLS6i8Zr776ap2vn23Tpk1897vf5dFHH2XFihXV\n03SuWLGCbt26AdCz57/+jHTp0oWOHTuydOlS+vbtW71/6dKlbNiwgYMOOmir97blC4BZvvXrl1m2\nqKOjDNgx3ebjgNci4u56jj8GXAQg6ShgVUS0jofPmu1AdbVqJ06cSEQwcOBAunXrRp8+fVi3bl2t\nrvO6rkNn73vggQd4/PHHefbZZ1m1ahULFiwgIqpfs3v37hx99NE8/PDDTJo0iQsvvLD63MGDB7Ny\n5Uqqqq
p49dVX6dWrF1VVVdX7GkrcAOPHj+eZZ57hueeeY9WqVbzxxhu13vPixf+6+lZVVcXatWvp\n3r37VvV069aNdu3aMX/+fKqqqqiqqmLVqlW8//77DcZgVkhyfavYF4CvAgMk/UXSy5JOlXSFpMsB\nIuIJ4G1JfwfGAFflMiazncnEiRMZOXIkc+bM4ZVXXuGVV15hypQpTJs2jZUrVwJ1J/2a+1avXk2H\nDh3o0qULa9eu5YYbbqiV8C+88EJuv/125s6du815wJvTdb569Wp22WUXunTpwpo1a/je975Xq8zU\nqVN58cUXWbduHTfddBOVlZWUlZVtVaZt27ZceumlDBs2jBUrVgCZpP/00083OSazfMppt3lE/BFo\n8B6TiBiayzjMcqFnz72adDtXc+pvioEDB9KmTZvq7ujDDz+cRYsWcdVVV22VxAYOHMj+++/P5MmT\nOf300xtsdQNcdNFFTJ8+nb333puysjJuvfVWxowZs1WZs88+myuvvJIvf/nL7LLLLvXG2ZzR5pdd\ndhnPPPMM5eXl7LnnnowYMYLx48dvVecFF1zAddddx0svvcTnPve5rW41y37Nu+66i1tuuYXDDz+c\nlStX0rNnT4YOHcqJJ57Y5LjM8kVpuf9RUqQl1i0kpWoGm8pK3w9bny3XWG3b9ttvP8aOHcuAAQPy\nHUqL8e++9Ujb32So/rtc6xuvH49qZi3i4YcfpqioqFUlbrNC5fm8zWy7VVZW8vrrrzNp0qR8h2K2\nU3DyNrPtNiNtfZFmKeduczMzs5Rx8jYzM0sZJ28zM7OU8TVvs0aoqKjwnNA7qYqKinyHYFaLk7dZ\nIyxYsCDfIZiZVXO3uZmZWco4eZuZmaWMk7eZmVnKOHmbmZmljJO3mZlZyjh5m5mZpYyTt5mZWcqk\n6j5vPyTDzMwsZck78h1AE/mrhpmZ5YK7zc3MzFLGydvMzCxlnLzNzMxSxsnbzMwsZZy8zczMUsbJ\n28zMLGWcvM3MzFLGydvMzCxlcpq8Jd0vaZmkV+s5frykVZJeTpabchmPmZlZa5DrJ6yNB+4BJm6j\nzO8j4swcx2FmZtZq5LTlHRGzgJUNFPNTRM3MzJqgEK55HyXpL5KmSTow38GYmZkVunxPTDIbqIiI\njySdBjwK9K2v8Mis9f7JYmZm1lrMmZNZGpLX5B0Ra7LWfyfpJ5JKI6KqrvIjd1hkZmZmO16/fpll\niwkT6i63I7rNRT3XtSXtlbV+BKD6EreZmZll5LTlLekBMr3bZZIWASOA9kBExFjgHElXAhuAj4Gv\n5DIeMzOz1iCnyTsihjRw/F7g3lzGYGZm1toUwmhzMzMzawInbzMzs5Rx8jYzM0sZJ28zM7OUcfI2\nMzNLmUYlb0nnSipO1m+S9IikQ3MbmpmZmdWlsS3vmyNitaRjgBOB+4H7cheWmZmZ1aexyXtT8u/p\nwNiImEbmYStmZma2gzU2eb8jaQyZJ6A9IalDE841MzOzFtTYBHweMB04JSJWAaXAtTmLyszMzOq1\nzcejSpoNzAJ+BzwREZ8ARMS7wLu5D8/MzMxqaqjlfSTwGzKTizwn6QlJwyXVO+e2mZmZ5dY2W94R\nsRGYmSxI6g6cCvxA0n7AnyPiqhzHaGZmZlmaNKtYRCwFxgHjJBUBR+ckKjMzM6vXNrvNJe0haYSk\nYZI6SrpP0lxJU4F9IuKPOyhOMzMzSzR0zfsBoAOwP/AC8A/gHOC3ZB7UYmZmZjtYQ93me0XEjZIE\nLIyIO5L9b0j6Vo5jMzMzszo01PLeBBARASyvcWxzTiIyMzOzbWqo5b2vpMcAZa2TbO+T08jMzMys\nTg0l77Oy1n+Y/Bs1ts3MzGwHaih5dwZ6RMS9AJJeALqSSeDX5Tg2MzMzq0ND17y/CzyWtd0eOJzM\nE9e+maOYzMzMbBsaanm3j4jFWduzImIFsELS7jmMy8zMzOrRUMu7S/ZGRAzN2uza8uGYmZlZQxpK\n3s9L+kbNnZKuIPPQFjMzM9vBGuo2/zbwqKQhwMvJvsPIPHVtUC4DMzMzs7o1NKvY+8DnJQ0ADkp2\nT4uIZ3MemZmZmdWpUbOKJcm6yQlb0v3AGcCyiDiknjI/Bk4D1gKXRMScpr6OmZnZzqSha97bazxw\nSn0HJZ0G9ImI/YErgJ/mOB4zM7PUy2nyjohZwMptFDkLmJiUfR7oJGmvXMZkZmaWdrlueTdkbyD7\nPvJ3kn1mZmZWj0Zd8y4UI7PW+yeLmZlZazFnTmZpSL6T9ztAz6ztHsm+Oo3MdTRmZmZ51K9fZtli\nwoS6y+2IbnMlS10eAy4CkHQUsCoilu2AmMzMzFIrpy1vSQ+Q6d0uk7QIGEFmcpOIiLER8YSkL0r6\nO5lbxb6Wy3jMzMxag5wm74gY0ogyQxsqY2ZmZv+S79HmZmZm1kT5HrBmZmYp1Lu8nIXLPEQpX5y8\nzcysyRYuW0bkO4gmqm/kdBq529zMzCxlnLzNzMxSxsnbzMwsZZy8zczMUsbJ28zMLGWcvM3MzFLG\nydvMzCxlnLzNzMxSxsnbzMwsZZy8zczMUsbJ28zMLGWcvM3MzFLGydvMzCxlnLzNzMxSxsnbzMws\nZZy8zczMUsbJ28zMLGWcvM3MzFLGydvMzCxlnLzNzMxSxsnbzMwsZZy8zczMUsbJ28zMLGVynrwl\nnSrpDUnzJF1Xx/GLJb0v6eVkuTTXMZmZmaVZ21xWLqkI+G/gBGAp8KKkqRHxRo2iv4qIYbmMxczM\nrLXIdcv7COCtiFgYERuAXwFn1VFOOY7DzMys1ch18t4bWJy1vSTZV9OXJM2R9KCkHjmOyczMLNVy\n2m3eSI8BD0TEBkmXAxPIdLPXMjJrvX+ymJmZtRZz5mSWhuQ6eb8D9Mra7pHsqxYRK7M2fwbcXl9l\nI1syMjMzswLTr19m2WLChLrL5brb/EVgP0kVktoD55NpaVeTVJ61eRbwWo5jMjMzS7WctrwjYpOk\nocBTZL4o3B8Rr0saBbwYEb8Fhkk6E9gAVAGX5DImMzOztMv5Ne+IeBL4VI19I7LWbwRuzHUcZmZm\nrYWfsGZmZpYyTt5mZmYp4+RtZmaWMk7eZmZmKePkbWZmljJO3mZmZinj5G1mZpYyTt5mZmYp4+Rt\nZmaWMk7eZmZmKePkbWZmljJO3mZmZinj5G1mZpYyTt5mZmYp4+RtZmaWMk7eZmZmKePkbWbb1KtX\nOZJStfTqVZ7vH5tZTrXNdwBmVtgWL17GjBn5jqJpKiuX5TsEs5xy8jbbwXqXl7NwmZOLmTWfk7fZ\nDrZw2TIi30E0gfIdgJnV4mveZmZmKePkbWZmljJO3mZmZinj5G1mZpYyTt5mZmYp4+RtZmaWMk7e\nZmZmKZPz5C3pVElvSJon6bo6jreX9CtJb0n6X0m9ch2TmZlZmuU0eUsqAv4bOAU4CBgs6dM1il0G\nVEXE/sBdwO25jMnMzCztct3yPgJ4KyIWRsQG4FfAWTXKnAVMSNa
nACfkOCYzM7NUy3Xy3htYnLW9\nJNlXZ5mI2ASsklSa47jMzMxSqxCfbV7vo5TT+Izlysp8R9A0Uhp/yumTtp9y2j7H4M/yjpDGn3Aa\nP8t1yXXyfgfIHoDWI9mXbQnQE1gqqQ1QEhFVNSuKiDR+TszMzFpcrrvNXwT2k1QhqT1wPvBYjTKP\nAxcn6+cCz+Y4JjMzs1TLacs7IjZJGgo8ReaLwv0R8bqkUcCLEfFb4H7gF5LeAlaQSfBmZmZWD0Wk\naWZhMzMz26mfsCZps6Q7sravkXRLE+s4TdKLkuZKmp1dn1mhkzRI0l8kvZwsf5G0SdI3Jf01KXOY\npLuS9RGSvpPfqM0yks/qy5LmSHpJ0lHJ/ootn9/WaqdO3sA64EvNvTVN0sHAPcCQiDgYOBz4ewvG\n19Drt9lRr2WtU0Q8GhGfjYhDI+JQ4CfAc8CTQCRlZkfE1fmM06wea5PPbj/gRuC2rGOtult5Z0/e\nG4GxQK2WRPLN7ZnkG93/SOpRx/nXAj+IiLcAImPMts6XNF7ST5JHwf5d0vGS7pf0mqRxWa+/WtKd\nSYv+fySVJftnSPqRpBeAYS3+E7GdlqS+wC3AhWT94Us+o49nFe0n6U+S3pT09R0dp1mW7LuQOgG1\n7lSS1EHSOEmvJr2j/ZP9B0p6Pqvl3ifZf5GkV5JeqAk16ysUO3vyDuBe4KuSimscuwcYn3yjeyDZ\nrulgYHY9dW/r/M4RcTSZLw2PAf8VEQcCh0g6JCmzO/BC0qL/PTAi6/x2EXFERPyosW/UbFsktQV+\nCXw7Imrezglbt2L+D9Af+Dxwi6Ty3EdoVqddk+T7OpmG2K11lPkWsDkiDgGGABOSu5++CdyV9Dgd\nDiyRdCCZFnz/iPgsMHyHvItm2NmTNxGxhszjWWv+ko4GJifrvwCOaWLVNc//QtaxLa2YvwLvRcRr\nyfbfgN7J+mbgwWR9Uo3X/3UTYzFryA+AuRExpRFlp0bE+ohYQebWziNyG5pZvT5Kus0PAE4j87e2\npmPI/A0lIt4EFgB9gf8Fvifpu0DviFgHDAAeioiVSflVuX8LzbPTJ+/E3WQmSNk9a1/N6yV1XT+Z\nS+YbW122db1lXfLv5qz1Ldv13b6XXd/abdRt1iRJN+LZZFoojZH9WRSt/NqipUNE/BnYQ9IeDRRV\nUn4yMBD4GJi2pTudlDw4bmdP3lt+iSvJtHIvyzr2J2Bwsn4B8Ic6zv8hcIOk/SEzi5qkK5pwfnUM\ndSgCzknWvwrM2uY7MWsGSV2AccBFEfFRI087K5nKtww4nszDmMzyofrvZzJjZRGZ54Vk+wOZv6Fb\nxnX0BN6pdaseAAAgAElEQVSUtE9EvB0R95C5fHkImZ6kc7YMYk7+fxSkQny2+Y6U3WL4LzItjy37\nhgHjJf0b8AHwtVonR/xV0tXAZEm7Juf+toHzt9Wir9m6PkLSzcAy4Cv1nG+2Pa4AugL3Jc8C39KS\n/tU2znkVmAmUAd+PiPdyHKNZfXaR9DL/SuIXRUTUeK79T8h8vl8FNgAXR8QGSedJujDZ9y7w7xGx\nStK/A89J2gj8Bbh0h72bJvBDWgqUpNURUXMQnZmZ2U7fbV7I/K3KzMzq5Ja3mZlZyrjlbWZmljJO\n3mZmZinj5G1mZpYyTt5mZmYp4+RtZmaWMk7eZq1AMttcQT5MoiHJHOF1PZO6vvKbJe2by5jMCp2T\nt9l2kHSMpD9KWiVpuaQ/SDos33FlS5LjxHzH0YCm3LPaqLLJtLybJfnvnLU6O/vjUc2aLZlG9nEy\njxh9CGgPHMvWk820OpKKImJzPkNoQrloQnmz1PA3UrPm6wtERDwYGesi4umImAsg6WJJsyTdk7TM\nX5M0YMvJkkok/UzSUkmLJd2qrIcyS7o0OWeFpN9J6pV17CRJr0taKeke6klQkk4hMz/xVyStlvSX\nZH83SVOTuudJ+np9b1LSeEk/kTRN0mqgfzIxyQ8lLZT0bnK8Q1K+s6THJb2f1P+4pO5Z9fWWNFPS\nh5KmA9ucBUrStcnPaImkr5HV8pb0xWQ+5w+TWLLnvX8u+XeVpH9KOlLSvpKeSXpJ3pc0SVLJtl7f\nrBA5eZs13zxgk6SfSzpVUuc6yhwJvEVmEo+RwCNZ5SYA64F9gc8CJwFfB5B0FnA9MIjMxCF/IJkf\nPpny8GEySXkPYD5bzxdfLSKmA6OBX0dEcUR8Njn0a2ARUA6cC4zOmhKxLoOBW5Pn7f8R+E9gPzIz\nMe0H7A3ckpQtIjNTWU+gF/ARcG9WXQ+QmYlsDzLziF9c34tKOhX4DnACsD9wYo0ia4ALI6ITcDrw\nTUlnJseOS/4tiYiSiHiezJec0cn7PgDoQeb3YpYuEeHFi5dmLsCnyCSqRWQS8VSga3LsYmBJjfLP\nk5mecE/gE6BD1rHzgWeS9SeAr2UdKyIz01xP4ELgTzXqXQxcWk+MI4CJWds9yMyktFvWvtHAuHrO\nHw/8vMa+NcA+WdtHA/+o5/x+wIpkvVfyc9o16/gvs+Orce79wOis7f2BTcC+9ZT/EfBfyXpFUrZo\nG7+/s4DZ+f4cefHS1MXXvM22Q0S8STJlYDJX8C+Bu0jmDwbeqXHKQqA7mcTSDng3aypOkfkSQHL8\nbkn/lWxvuX67d3L+4hr11tzelu5AVWw9f/dCYFsD7arrl9QV2A2YndXLX5TESDI97l3AKUDnZH/H\n5JJAN2BlRHxc47V7bCPWl2qUzb60cCTwH8DBZMYctCcz/qBOkvYE7iYzNqEj0Aaoqv9tmxUmd5ub\ntZCImAf8nEwi2WLvGsV6AUvJJMNPgLKIKI2ILhHROSIOScotAq5Ijm053jEi/kxm7uFeNertua3Q\namwvBUol7V4jrppfNOqrYzmZrvCDsuLrHJmua4BryLSQPxcRnflX97WS2LskCT77tevzLlu/t4oa\nsfwSeBTYO3mtMfwrudc1Kn00sDmJvTNwAR7QZink5G3WTJI+Jek7kvZOtnuSuTb8v1nF9pT0fyW1\nlXQu8GngiYh4D3gK+JGkYmXsK2lLohsD3CjpwKTuTpLOSY5NAw6UNEhSG0nDgb22EeoyoPeWwXAR\nsQT4E/AfkjpIOgS4DGjUvdYREcD/A+5KWuFI2lvSyUmRYuBj4J+SSsm6phwRi8i0pEdJaifpGGDg\nNl7uQeASSQdI2o1/XVffoiOZlvwGSUcAQ7KOfUAmUffJ2ldMpst/dfJ7u7Yx79ms0Dh5mzXfajID\n0p5PRmH/CXgV+LesMs+TaYUuB24FvhwRK5NjF5Hp5n2NTNftQ2QGUhERjwK3Ab+StCqp99Tk2Aoy\ng8z+M6m3D5lBZPV5iEzrcoWkLV3QQ4B9yLTCHwZujogZ9ZxfVwv2OuDvwJ+T+J4iM/oeMl3muyWx\n/YnM9ftsQ4CjgBXAzWQG7tX9whFPJvU9S2aA4DM1ilwF3CrpQ+AmMgPxtpz7MfDvwB8lVSXJfRSZ\nywOryNzm93B9r21WyA
piPu/kWuGv+dc9mfuS+WPy47wGZrYdJF0MXBYRxzVY2MysCQpiwFpyrfCz\nkHkABLAE+E1egzIzMytQhdhtfiIwPyKaMnrWzMxsp1EQ3ebZJN1P5r7Ln+Q7FjMzs0JUUMlbUjsy\nA2gOjIgPahwrnEDNzMx2kIiodTtjQVzzznIamVb3B3UdLKQvGo0xcuRIRo4cme8wzLaLP8fWWqTx\ns5z1IKStFNo178Ekz282MzOzuhVM8k4ewHAi8Ei+YzEzMytkBdNtnjxnuWu+42hJ/fv3z3cIZtvN\nn2NrLVrTZ7mgBqxti6RIS6xmZmYtQVIqBqyZFaTevXuzcOHCfIdheVBRUcGCBQvyHYbZVtzyNmuE\n5NtvvsOwPPDv3vKpvpZ3wQxYa4169SpHUmqWXr3K8/0jMzOzRnDLO4ckMaO+eZoKUGVl+u6l31Hc\n+tp5+Xdv+eSWt5mZWSvh5G1mZpYyTt5mrdgDDzzAqaee2iJ1VVZWMm7cuBapq7GmT5/O/vvvX+/x\nwYMHM3r06B0YkVlhcPI2a6be5bkdkNi7vHEDCGfNmsUXvvAFOnfuzB577MGxxx7L7NmzARgyZAhP\nPvlkLn8MDcZWXFxMSUkJHTt2pKioiJKSkup9S5YsabCO+p7tbLYz833eZs20cNkycjmMScuWNVhm\n9erVDBw4kDFjxnDuueeyfv16/vCHP9ChQ4ccRtZ4xxxzDKtXrwZg4cKF7Lvvvnz44YdOyGbbyS1v\nsxSbN28ekjjvvPOQRIcOHTjxxBM5+OCDAZgwYQLHHntsdfmioiLGjBlD3759KS0tZejQodXHNm/e\nzDXXXEPXrl3p06cP9957L0VFRWzevLnO1x43bhwHHnggZWVlnHbaaSxatKhRMdccuT127FgOOOAA\nSkpK6Nu3L+PHj69VfuTIkZSVldGnTx+mTJlSb92/+c1v+MxnPkOXLl04/vjjef311xsVk1naOHmb\npVjfvn1p06YNl1xyCU8++SSrVq2qVaZmK3fatGnMnj2bV155hQcffJCnnnoKyCTR6dOn8+qrr/Ly\nyy/z6KOP1ttCnjp1KrfddhuPPvooH3zwAcceeyyDBw9u1nvo3r0706dP55///Cc//elP+da3vrVV\n0l2wYAEbN25k2bJljB07losvvrjOp939+c9/ZujQoUyYMIGqqiouvPBCBg0aVO+XD7M0c/I2S7Hi\n4mJmzZpFUVERl19+OXvuuSdnnXUWH3zwQb3n3HDDDRQXF9OzZ08qKyuZM2cOAA899BDDhw+nW7du\ndOrUieuvv77eOsaMGcMNN9xA3759KSoq4vrrr2fOnDksXry4ye/hjDPOoFevXgAMGDCA448/nlmz\nZlUfb9euHbfccgtt27blhBNO4MQTT6yz9T127FiGDh1Kv379kMTXv/511q1bV33936w1cfI2S7lP\nfepTjBs3jkWLFjF37lyWLl3K1VdfXW/5vfbaq3p9t912Y82aNQAsXbqUnj17Vh/LXq9p4cKFDB8+\nnNLSUkpLSykrK0MS77zzTpPjf+yxxzjyyCMpKyujS5cuzJgxg+XLl1cf79q1K+3bt6/erqioYOnS\npXXGNHr06OqYunTpwvLly5sVk1mhc/I2a0X69u3LJZdcwty5c5t8brdu3bYa/b2ta9g9e/ZkzJgx\nVFVVUVVVxcqVK1mzZg1HHXVUk17zo48+4rzzzmPEiBEsX76clStXUllZudV18eXLl7N+/fqt4ure\nvXudMX3/+9+vFdOgQYOaFJNZGjh5m6XYm2++yZ133lnduly8eDGTJ0/m6KOPbnJd5513HnfffTdL\nly5l1apV3H777fWW/eY3v8no0aN57bXXAPjwww+3OZBsi5qD1T7++GM2btxI165dgUwrfObMmVuV\nWb9+PbfeeisbNmzg2Wef5emnn+acc86pVffll1/OPffcU91NvmbNGh5//HE++eSTBuMySxvfKmbW\nTBV77dWo27m2p/6GFBcX8/zzz3PnnXfy4Ycf0rlzZwYOHFhv4q05AC17+xvf+AZvvfUWhxxyCJ06\ndWLYsGE899xzFBUV1So7aNAg1q5dy/nnn8+iRYvo1KkTJ510Up1JdVuvX1ZWxg9/+EPOOOMMNm7c\nyNlnn83pp5++VZl99tmHtm3bUl5eTqdOnfj5z39ORUVFrfo+//nP8+Mf/5grrriC+fPns/vuu3Pc\nccdx8sknbzMmszQqmIlJJHUCfgYcDGwGLo2I57OOe2KSHPPEJPXbGSenePLJJ7nyyit5++238x1K\nXu2Mv3srHGmYmORu4ImIOAD4DOAbNM12oE8++YTf/e53bNq0iXfeeYdRo0bxpS99Kd9hmVkdCiJ5\nSyoBjo2I8QARsTEi/pnnsMx2KhHBiBEjKC0t5bDDDuOggw5i1KhR+Q7LzOpQKNe89wGWSxpPptX9\nEjA8Ij7Ob1hmO49dd92VF154Id9hmFkjFErybgscCnwrIl6SdBdwPTAiu9DIkSOr1/v370///v13\nYIhmZma5NXPmzFp3XNSlIAasSdoL+N+I2DfZPga4LiIGZpXxgLUc84C1+nnQ0s7Lv3vLp4IesBYR\ny4DFkvomu04AXstjSGZmZgWrULrNAYYBv5TUDvgH8LU8x2NmZlaQCiZ5R8QrwOfyHYeZmVmhK4hu\nczOz7TF48GBGjx5d57F169ZRVFRU52QmZmnl5G3WTL16lSMpZ0uvXuWNjqV3797stttulJSUUFxc\nTElJCcOGDas+PnPmTIqKirjjjjty8aPYbldeeWV13B06dKB9+/aUlJRQUlJS63GpzVHfvORmaVUw\n3eZmabN48bKc3k1QWdn456ZLYtq0aVRWVtZ5fOLEiZSVlTFx4kSuvfbaeuvZtGkTbdq0aXBfcy1c\nuJD+/fvXeuTqfffdx3333QfAqFGjmD9/PhMnTmyR1wTfRWGtj1veZq1EfQnqo48+YsqUKdx77728\n9dZbvPzyy9XHFi5cSFFREePGjaOiooITTjihzn2QmXWsW7dudOnShf79+1fPKPbSSy9RXl6+1es/\n8sgj9OvXr854mtMK3rRpE+eccw7l5eWUlpZywgknMG/evK3KvPfeewwYMICSkhJOOumkervJP/nk\nE66++mp69epF9+7dGT58OBs2bGhyTGb55ORt1so9/PDDFBcXc+6553LyySczYcKEWmV+//vf88Yb\nbzB9+vR6933xi19k/vz5vP/++xx66KF89atfBeDwww9njz324Kmnnqo+d9KkSVxyySUt+j4GDRrE\n22+/zXvvvcenP/1pLr744q2OT5o0idtuu43ly5ez33771Tq+xbe//W2WLl3K3/72N958803mzZvH\nbbfd1qKxmuWak7dZKzFo0CBKS0vp0qULpaWl3H///UCmy/z8889HEkOGDOFXv/oVmzZtqj5PEqNG\njWLXXXelQ4cO9e675JJL2G233WjXrh233HILr7zyCqtXrwbgoosu4he/+AUAVVV
... [remainder of base64-encoded PNG data omitted; the rendered figure is a pair of bar charts, 'Speed to store data' and 'Speed to read data', in GB/s for 'No Compr', 'Zlib' and 'Blosc', comparing 'Single Table' against 'EArray+Table'] ...", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=False, sharey=False)\n", + "fig.set_size_inches(w=8, h=7, forward=False)\n", + "f = .550 # conversion factor to GB/s\n", + "rects1 = ax1.bar(np.arange(3), f / np.array([.323, 3.17, .599]), 0.25, color='r')\n", + "rects2 = ax1.bar(np.arange(3) + 0.25, f / np.array([.250, 2.74, .258]), 0.25, color='y')\n", + "_ = ax1.set_ylabel('GB/s')\n", + "_ = ax1.set_xticks(np.arange(3) + 0.25)\n", + "_ = ax1.set_xticklabels(('No Compr', 'Zlib', 'Blosc'))\n", + "_ = ax1.legend((rects1[0], rects2[0]), ('Single Table', 'EArray+Table'), loc=9)\n", + "_ = ax1.set_title('Speed to store data')\n", + "\n", + "rects1 = ax2.bar(np.arange(3), f / np.array([.099, .592, .782]), 0.25, color='r')\n", + "rects2 = ax2.bar(np.arange(3) + 0.25, f / np.array([.082, 1.09, .171]), 0.25, color='y')\n", + "_ = ax2.set_ylabel('GB/s')\n", + "_ = ax2.set_xticks(np.arange(3) + 0.25)\n", + "_ = ax2.set_xticklabels(('No Compr', 'Zlib', 'Blosc'))\n", + "_ = ax2.legend((rects1[0], rects2[0]), ('Single Table', 'EArray+Table'), loc=9)\n", + "_ = ax2.set_title('Speed to read data')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, 
see the different sizes for the final files:" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "... [base64-encoded PNG data omitted; the rendered figure is a bar chart, 'Size for stored datasets', with sizes in MB for 'No Compr', 'Zlib' and 'Blosc', comparing 'Single Table' against 'EArray+Table'] ...", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax1 = plt.subplots()\n", + "fig.set_size_inches(w=8, h=5)\n", + "rects1 = ax1.bar(np.arange(3), np.array([550, 17, 42]), 0.25, color='r')\n", + "rects2 = ax1.bar(np.arange(3) + 0.25, np.array([550, 4.1, 9]), 0.25, color='y')\n", + "_ = ax1.set_ylabel('MB')\n", + "_ = ax1.set_xticks(np.arange(3) + 0.25)\n", + "_ 
= ax1.set_xticklabels(('No Compr', 'Zlib', 'Blosc'))\n", + "_ = ax1.legend((rects1[0], rects2[0]), ('Single Table', 'EArray+Table'), loc=9)\n", + "_ = ax1.set_title('Size for stored datasets')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main conclusion here is that, whenever you have a lot of data to dump (typically in the form of an array), a combination of an EArray + Table is preferable to a single Table. The reason for this is that HDF5 can store the former arrangement more efficiently, and that fast compressors like Blosc work much better on it too." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The deeper explanation of why using an EArray to store the raw data gives these advantages is that we are physically (not only logically!) separating data that is highly related (like the result of some measurements) and also homogeneous (of type Int32 in this case). This manual separation is critical for getting better compression ratios and faster speeds, especially when using fast compressors like Blosc." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Finally, although meaningful, this experiment is based on a purely synthetic dataset. It is always wise to use your own data in order to draw your own conclusions. It is especially recommended to have a look at the different compressors that come with PyTables and see which one fits your needs best: http://www.pytables.org/usersguide/libref/helper_classes.html" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
diff --git a/examples/add-column.py b/examples/add-column.py new file mode 100644 index 0000000..b707b6c --- /dev/null +++ b/examples/add-column.py @@ -0,0 +1,74 @@ +"Example showing how to add a column to an existing table" + +import tables as tb + + +class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.Int32Col(pos=2) # integer + longi = tb.Int32Col(pos=3) # integer + pressure = tb.Float32Col(pos=4) # float (single-precision) + temperature = tb.Float64Col(pos=5) # double (double-precision) + +# Open a file in "w"rite mode +fileh = tb.open_file("add-column.h5", mode="w") +# Create a new group +group = fileh.create_group(fileh.root, "newgroup") + +# Create a new table in newgroup group +table = fileh.create_table(group, 'table', Particle, "A table", + tb.Filters(1)) + +# Append several rows +table.append([("Particle: 10", 10, 0, 10 * 10, 10 ** 2), + ("Particle: 11", 11, -1, 11 * 11, 11 ** 2), + ("Particle: 12", 12, -2, 12 * 12, 12 ** 2)]) + +print("Contents of the original table:", fileh.root.newgroup.table[:]) + +# close the file +fileh.close() + +# Open it again in append mode +fileh = tb.open_file("add-column.h5", "a") +group = fileh.root.newgroup +table = group.table + +# Get a description of table in dictionary format +descr = table.description._v_colobjects +descr2 = descr.copy() + +# Add a column to description +descr2["hot"] = tb.BoolCol(dflt=False) + +# Create a new table with the new description
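+# (PyTables cannot add a column to an existing Table in place, so the usual
+# recipe is to build a second table with the extended description and then
+# copy the data over, as done below.)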
+table2 = fileh.create_table(group, 'table2', descr2, "A table", + tb.Filters(1)) + +# Copy the user attributes +table.attrs._f_copy(table2) + +# Fill the rows of new table with default values +for i in range(table.nrows): + table2.row.append() +# Flush the rows to disk +table2.flush() + +# Copy the columns of source table to destination +for col in descr: + getattr(table2.cols, col)[:] = getattr(table.cols, col)[:] + +# Fill the new column +table2.cols.hot[:] = [row["temperature"] > 11 ** 2 for row in table] + +# Remove the original table +table.remove() + +# Move table2 to table +table2.move('/newgroup', 'table') + +# Print the new table +print("Contents of the table with column added:", fileh.root.newgroup.table[:]) + +# Finally, close the file +fileh.close() diff --git a/examples/array1.py b/examples/array1.py new file mode 100644 index 0000000..329840d --- /dev/null +++ b/examples/array1.py @@ -0,0 +1,50 @@ +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +fileh = tb.open_file("array1.h5", mode="w") +# Get the root group +root = fileh.root + +# Create an Array +a = np.array([-1, 2, 4], np.int16) +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'array_1', a, "Signed short array") + +# Create a scalar Array +a = np.array(4, np.int16) +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'array_s', a, "Scalar signed short array") + +# Create a 3-d array of floats +a = np.arange(120, dtype=np.float64).reshape(20, 3, 2) +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'array_f', a, "3-D float array") + +# Close the file +fileh.close() + +# Open the file for reading +fileh = tb.open_file("array1.h5", mode="r") +# Get the root group +root = fileh.root + +a = root.array_1.read() +print("Signed byte array -->", repr(a), a.shape) + +print("Testing iterator (works even over scalar arrays):", end=' ') +arr = root.array_s +for x in arr: + print("nrow-->", arr.nrow) + print("Element-->", repr(x)) + +# print "Testing getitem:" +# for i in range(root.array_1.nrows): +# print "array_1["+str(i)+"]", "-->", root.array_1[i] + +print("array_f[:,2:3,2::2]", repr(root.array_f[:, 2:3, 2::2])) +print("array_f[1,2:]", repr(root.array_f[1, 2:])) +print("array_f[1]", repr(root.array_f[1])) + +# Close the file +fileh.close() diff --git a/examples/array2.py b/examples/array2.py new file mode 100644 index 0000000..110c8cd --- /dev/null +++ b/examples/array2.py @@ -0,0 +1,42 @@ +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +fileh = tb.open_file("array2.h5", mode="w") +# Shortcut to the root group +root = fileh.root + +# Create an array +a = np.array([1, np.e, np.pi], float) +print("About to write array:", a) +print(" with shape: ==>", a.shape) +print(" and dtype ==>", a.dtype) + +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'carray', a, "Float array") + +# Get metadata on the previously saved array +print() +print("Info on the object:", repr(root.carray)) + +# Close the file +fileh.close() + +# Open the previous HDF5 file in read-only mode +fileh = tb.open_file("array2.h5", mode="r") +# Get the root group +root = fileh.root + +# Get metadata on the previously saved array +print() +print("Info on the object:", repr(root.carray)) + +# Get the actual array +b = root.carray.read() +print() +print("Array read from file:", b) +print(" with shape: ==>", b.shape) +print(" and dtype ==>", b.dtype) + +# Close the file +fileh.close() diff --git a/examples/array3.py b/examples/array3.py new file mode 100644 index 
0000000..5795277 --- /dev/null +++ b/examples/array3.py @@ -0,0 +1,46 @@ +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +fileh = tb.open_file("array3.h5", mode="w") +# Get the root group +root = fileh.root + +# Create a large array +# a = reshape(array(range(2**16), "s"), (2,) * 16) +a = np.ones((2,) * 8, np.int8) +print("About to write array a") +print(" with shape: ==>", a.shape) +print(" and dtype: ==>", a.dtype) + +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'carray', a, "Large array") + +# Get metadata on the previously saved array +print() +print("Info on the object:", repr(root.carray)) + +# Close the file +fileh.close() + +# Open the previous HDF5 file in read-only mode +fileh = tb.open_file("array3.h5", mode="r") +# Get the root group +root = fileh.root + +# Get metadata on the previously saved array +print() +print("Getting info on retrieved /carray object:", repr(root.carray)) + +# Get the actual array +# b = fileh.readArray("/carray") +# You can obtain the same result with: +b = root.carray.read() +print() +print("Array b read from file") +print(" with shape: ==>", b.shape) +print(" with dtype: ==>", b.dtype) +# print " contents:", b + +# Close the file +fileh.close() diff --git a/examples/array4.py b/examples/array4.py new file mode 100644 index 0000000..fa92b39 --- /dev/null +++ b/examples/array4.py @@ -0,0 +1,48 @@ +import numpy as np +import tables as tb + +basedim = 4 +file = "array4.h5" +# Open a new empty HDF5 file +fileh = tb.open_file(file, mode="w") +# Get the root group +group = fileh.root +# Set the type codes to test +dtypes = [np.int8, np.uint8, np.int16, int, np.float32, float] +for i, dtype in enumerate(dtypes, 1): + # Create an array of dtype, with incrementally bigger ranges + a = np.ones((basedim,) * i, dtype) + # Save it on the HDF5 file + dsetname = f'array_{a.dtype.char}' + hdfarray = fileh.create_array(group, dsetname, a, "Large array") + print(f"Created dataset: {hdfarray}") + # Create a new group + group = fileh.create_group(group, f'group{i}') + +# Close the file +fileh.close() + +# Open the previous HDF5 file in read-only mode +fileh = tb.open_file(file, mode="r") +# Get the root group +group = fileh.root +# Get the metadata on the previosly saved arrays +for i, dtype in enumerate(dtypes, 1): + # Create an array for later comparison + a = np.ones((basedim,) * i, dtype) + # Get the dset object hangin from group + dset = getattr(group, 'array_' + a.dtype.char) + print(f"Info from dataset: {dset!r}") + # Read the actual data in array + b = dset.read() + print(f"Array b read from file. Shape ==> {b.shape}. 
Dtype ==> {b.dtype}") + # Test if the original and read arrays are equal + if np.allclose(a, b): + print("Good: Read array is equal to the original") + else: + print("Error: Read array and the original differs!") + # Iterate over the next group + group = getattr(group, f'group{i}') + +# Close the file +fileh.close() diff --git a/examples/attributes1.py b/examples/attributes1.py new file mode 100644 index 0000000..07096d6 --- /dev/null +++ b/examples/attributes1.py @@ -0,0 +1,32 @@ +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +fileh = tb.open_file("attributes1.h5", mode="w", title="Testing attributes") +# Get the root group +root = fileh.root + +# Create an array +a = np.array([1, 2, 4], np.int32) +# Save it on the HDF5 file +hdfarray = fileh.create_array(root, 'array', a, "Integer array") + +# Assign user attributes + +# A string +hdfarray.attrs.string = "This is an example" + +# A Char +hdfarray.attrs.char = "1" + +# An integer +hdfarray.attrs.int = 12 + +# A float +hdfarray.attrs.float = 12.32 + +# A generic object +hdfarray.attrs.object = {"a": 32.1, "b": 1, "c": [1, 2]} + +# Close the file +fileh.close() diff --git a/examples/attrs-with-padding.py b/examples/attrs-with-padding.py new file mode 100644 index 0000000..f9ffe53 --- /dev/null +++ b/examples/attrs-with-padding.py @@ -0,0 +1,45 @@ +# This is an example on how to use complex columns +import numpy as np +import tables as tb + +dt = np.dtype('i4,f8', align=True) + +# Create a file with regular padding +print("attrs *with* padding:") +fileh = tb.open_file("attrs-with-padding.h5", mode="w", pytables_sys_attrs=False) +attrs = fileh.root._v_attrs +# Set some attrs +attrs.pq = np.zeros(2, dt) +attrs.qr = np.ones((2, 2), dt) +attrs.rs = np.array([(1, 2)], dt) +print("repr(attrs)-->", repr(attrs)) + +fileh.close() + +# Create a file with no padding +print("\nattrs *without* padding:") +fileh = tb.open_file("attrs-without-padding.h5", mode="w", pytables_sys_attrs=False, allow_padding=False) +attrs = fileh.root._v_attrs +# Set some attrs +attrs.pq = np.zeros(2, dt) +attrs.qr = np.ones((2, 2), dt) +attrs.rs = np.array([(1, 2)], dt) +print("repr(attrs)-->", repr(attrs)) + +fileh.close() + +print("\n ***After closing***\n") + +print("attrs *with* padding:") +fileh = tb.open_file("attrs-with-padding.h5", mode="r") +attrs = fileh.root._v_attrs +print("repr(attrs)-->", repr(attrs)) + +fileh.close() + +print("\nattrs *without* padding:") +fileh = tb.open_file("attrs-without-padding.h5", mode="r") +attrs = fileh.root._v_attrs +print("repr(attrs)-->", repr(attrs)) + +fileh.close() diff --git a/examples/carray1.py b/examples/carray1.py new file mode 100644 index 0000000..3c29ad7 --- /dev/null +++ b/examples/carray1.py @@ -0,0 +1,19 @@ +import numpy as np +import tables as tb + +fileName = 'carray1.h5' +shape = (200, 300) +atom = tb.UInt8Atom() +filters = tb.Filters(complevel=5, complib='zlib') + +h5f = tb.open_file(fileName, 'w') +ca = h5f.create_carray(h5f.root, 'carray', atom, shape, filters=filters) +# Fill a hyperslab in ``ca``. 
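+# (only the selected 50x50 region is written; the rest of the chunked array
+# keeps the atom default, which is zero for this UInt8Atom, as the read
+# around the block boundary further down shows)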
+ca[10:60, 20:70] = np.ones((50, 50)) +h5f.close() + +# Re-open and read another hyperslab +h5f = tb.open_file(fileName) +print(h5f) +print(h5f.root.carray[8:12, 18:22]) +h5f.close() diff --git a/examples/check_examples.sh b/examples/check_examples.sh new file mode 100755 index 0000000..79cd466 --- /dev/null +++ b/examples/check_examples.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# +# Small script to check the example repository quickly + +# CONFIGURATION - interpreter to use +PYTHON=python + +# exit on non-zero return status +set -e + +for script in \ + add-column.py \ + array1.py \ + array2.py \ + array3.py \ + array4.py \ + attributes1.py \ + carray1.py \ + earray1.py \ + earray2.py \ + index.py \ + inmemory.py \ + links.py \ + nested1.py \ + nested-tut.py \ + particles.py \ + read_array_out_arg.py \ + split.py \ + table1.py \ + table2.py \ + table3.py \ + tutorial1-1.py \ + tutorial1-2.py \ + tutorial3-1.py \ + tutorial3-2.py \ + undo-redo.py \ + vlarray1.py \ + vlarray2.py \ + vlarray3.py \ + vlarray4.py +do + $PYTHON "$script" +done + +#TO DEBUG: +#--------- python2.7 works +#--------- python3.4 DON'T WORK +# filenodes1.py +# multiprocess_access_queues.py +# multiprocess_access_benchmarks.py +# objecttree.py +# table-tree.py + +#--------- python2.7 DON'T WORK +#--------- python3.4 DON'T WORK +# enum.py # This should always fail +# nested-iter.py # Run this after "tutorial1-1.py" (file missing) +# tutorial2.py # This should always fail at the beginning + diff --git a/examples/earray1.py b/examples/earray1.py new file mode 100644 index 0000000..e8fe6b8 --- /dev/null +++ b/examples/earray1.py @@ -0,0 +1,15 @@ +import tables as tb +import numpy as np + +fileh = tb.open_file('earray1.h5', mode='w') +a = tb.StringAtom(itemsize=8) +# Use ``a`` as the object type for the enlargeable array. +array_c = fileh.create_earray(fileh.root, 'array_c', a, (0,), "Chars") +array_c.append(np.array(['a' * 2, 'b' * 4], dtype='S8')) +array_c.append(np.array(['a' * 6, 'b' * 8, 'c' * 10], dtype='S8')) + +# Read the string ``EArray`` we have created on disk. +for s in array_c: + print(f'array_c[{array_c.nrow}] => {s!r}') +# Close the file. 
+fileh.close() diff --git a/examples/earray2.py b/examples/earray2.py new file mode 100644 index 0000000..5916087 --- /dev/null +++ b/examples/earray2.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +"""Small example that shows how to work with extendeable arrays of different +types, strings included.""" + +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +filename = "earray2.h5" +fileh = tb.open_file(filename, mode="w") +# Get the root group +root = fileh.root + +# Create an string atom +a = tb.StringAtom(itemsize=1) +# Use it as a type for the enlargeable array +hdfarray = fileh.create_earray(root, 'array_c', a, (0,), "Character array") +hdfarray.append(np.array(['a', 'b', 'c'])) +# The next is legal: +hdfarray.append(np.array(['c', 'b', 'c', 'd'])) +# but these are not: +# hdfarray.append(array([['c', 'b'], ['c', 'd']])) +# hdfarray.append(array([[1,2,3],[3,2,1]], dtype=uint8).reshape(2,1,3)) + +# Create an atom +a = tb.UInt16Atom() +hdfarray = fileh.create_earray(root, 'array_e', a, (2, 0, 3), + "Unsigned short array") + +# Create an enlargeable array +a = tb.UInt8Atom() +hdfarray = fileh.create_earray(root, 'array_b', a, (2, 0, 3), + "Unsigned byte array", + tb.Filters(complevel=1)) + +# Append an array to this table +hdfarray.append( + np.array([[1, 2, 3], [3, 2, 1]], dtype=np.uint8).reshape(2, 1, 3)) +hdfarray.append( + np.array([[1, 2, 3], [3, 2, 1], [2, 4, 6], [6, 4, 2]], + dtype=np.uint8).reshape(2, 2, 3) * 2) +# The next should give a type error: +# hdfarray.append(array([[1,0,1],[0,0,1]], dtype=Bool).reshape(2,1,3)) + +# Close the file +fileh.close() + +# Open the file for reading +fileh = tb.open_file(filename, mode="r") +# Get the root group +root = fileh.root + +a = root.array_c.read() +print("Character array -->", repr(a), a.shape) +a = root.array_e.read() +print("Empty array (yes, this is suported) -->", repr(a), a.shape) +a = root.array_b.read(step=2) +print("Int8 array, even rows (step = 2) -->", repr(a), a.shape) + +print("Testing iterator:", end=' ') +# for x in root.array_b.iterrows(step=2): +for x in root.array_b: + print("nrow-->", root.array_b.nrow) + print("Element-->", x) + +print("Testing getitem:") +for i in range(root.array_b.shape[0]): + print("array_b[" + str(i) + "]", "-->", root.array_b[i]) +# The nrows counts the growing dimension, which is different from the +# first index +for i in range(root.array_b.nrows): + print("array_b[:," + str(i) + ",:]", "-->", root.array_b[:, i, :]) +print("array_c[1:2]", repr(root.array_c[1:2])) +print("array_c[1:3]", repr(root.array_c[1:3])) +print("array_b[:]", root.array_b[:]) + +print(repr(root.array_c)) +# Close the file +fileh.close() diff --git a/examples/filenodes1.py b/examples/filenodes1.py new file mode 100644 index 0000000..f3d3910 --- /dev/null +++ b/examples/filenodes1.py @@ -0,0 +1,51 @@ +import tables as tb +h5file = tb.open_file('fnode.h5', 'w') + + +fnode = tb.nodes.filenode.new_node(h5file, where='/', name='fnode_test') + + +print(h5file.get_node_attr('/fnode_test', 'NODE_TYPE')) + + +print("This is a test text line.", file=fnode) +print("And this is another one.", file=fnode) +print(file=fnode) +fnode.write("Of course, file methods can also be used.") + +fnode.seek(0) # Go back to the beginning of file. 
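+# The calls below use the ``filenode`` helper module under its bare name;
+# it is assumed here to come from the explicit import that follows, since
+# ``import tables`` alone does not make the bare name available.
+from tables.nodes import filenode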
+ +for line in fnode: + print(repr(line)) + + +fnode.close() +print(fnode.closed) + + +node = h5file.root.fnode_test +fnode = filenode.open_node(node, 'a+') +print(repr(fnode.readline())) +print(fnode.tell()) +print("This is a new line.", file=fnode) +print(repr(fnode.readline())) + + +fnode.seek(0) +for line in fnode: + print(repr(line)) + + +fnode.attrs.content_type = 'text/plain; charset=us-ascii' + + +fnode.attrs.author = "Ivan Vilata i Balaguer" +fnode.attrs.creation_date = '2004-10-20T13:25:25+0200' +fnode.attrs.keywords_en = ["FileNode", "test", "metadata"] +fnode.attrs.keywords_ca = ["FileNode", "prova", "metadades"] +fnode.attrs.owner = 'ivan' +fnode.attrs.acl = {'ivan': 'rw', '@users': 'r'} + + +fnode.close() +h5file.close() diff --git a/examples/index.py b/examples/index.py new file mode 100644 index 0000000..c2e958f --- /dev/null +++ b/examples/index.py @@ -0,0 +1,35 @@ +import random +import tables as tb +print('tables.__version__', tb.__version__) + +nrows = 10_000 - 1 + + +class Distance(tb.IsDescription): + frame = tb.Int32Col(pos=0) + distance = tb.Float64Col(pos=1) + +h5file = tb.open_file('index.h5', mode='w') +table = h5file.create_table(h5file.root, 'distance_table', Distance, + 'distance table', expectedrows=nrows) +row = table.row +for i in range(nrows): + # r['frame'] = nrows-i + row['frame'] = random.randint(0, nrows) + row['distance'] = float(i ** 2) + row.append() +table.flush() + +table.cols.frame.create_index(optlevel=9, _testmode=True, _verbose=True) +# table.cols.frame.optimizeIndex(level=5, verbose=1) + +results = [r.nrow for r in table.where('frame < 2')] +print("frame<2 -->", table.read_coordinates(results)) +# print("frame<2 -->", table.get_where_list('frame < 2')) + +results = [r.nrow for r in table.where('(1 < frame) & (frame <= 5)')] +print("rows-->", results) +print("1", table.read_coordinates(results)) +# print("1", table.get_where_list('(1 < frame) & (frame <= 5)')) + +h5file.close() diff --git a/examples/inmemory.py b/examples/inmemory.py new file mode 100644 index 0000000..d31bb22 --- /dev/null +++ b/examples/inmemory.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +"""inmemory.py. 
+ +Example usage of creating in-memory HDF5 file with a specified chunksize +using PyTables 3.0.0+ + +See also Cookbook page +http://pytables.github.io/cookbook/inmemory_hdf5_files.html and available +drivers +http://pytables.github.io/usersguide/parameter_files.html#hdf5-driver-management + +""" + +import numpy as np +import tables as tb + +CHUNKY = 30 +CHUNKX = 4320 + +if __name__ == '__main__': + + # create dataset and add global attrs + file_path = 'demofile_chunk%sx%d.h5' % (CHUNKY, CHUNKX) + + with tb.open_file(file_path, 'w', + title='PyTables HDF5 In-memory example', + driver='H5FD_CORE') as h5f: + + # dummy some data + lats = np.empty([2160]) + lons = np.empty([4320]) + + # create some simple arrays + lat_node = h5f.create_array('/', 'lat', lats, title='latitude') + lon_node = h5f.create_array('/', 'lon', lons, title='longitude') + + # create a 365 x 4320 x 8640 CArray of 32bit float + shape = (5, 2160, 4320) + atom = tb.Float32Atom(dflt=np.nan) + + # chunk into daily slices and then further chunk days + sst_node = h5f.create_carray( + h5f.root, 'sst', atom, shape, chunkshape=(1, CHUNKY, CHUNKX)) + + # dummy up an ndarray + sst = np.empty([2160, 4320], dtype=np.float32) + sst.fill(30.0) + + # write ndarray to a 2D plane in the HDF5 + sst_node[0] = sst diff --git a/examples/links.py b/examples/links.py new file mode 100644 index 0000000..d3a52f8 --- /dev/null +++ b/examples/links.py @@ -0,0 +1,50 @@ +import tables as tb + +# Create a new file with some structural groups +f1 = tb.open_file('links1.h5', 'w') +g1 = f1.create_group('/', 'g1') +g2 = f1.create_group(g1, 'g2') + +# Create some datasets +a1 = f1.create_carray(g1, 'a1', tb.Int64Atom(), shape=(10_000,)) +t1 = f1.create_table(g2, 't1', {'f1': tb.IntCol(), 'f2': tb.FloatCol()}) + +# Create new group and a first hard link +gl = f1.create_group('/', 'gl') +ht = f1.create_hard_link(gl, 'ht', '/g1/g2/t1') # ht points to t1 +print(f"``{ht}`` is a hard link to: ``{t1}``") + +# Remove the orginal link to the t1 table +t1.remove() +print("table continues to be accessible in: ``%s``" % f1.get_node('/gl/ht')) + +# Let's continue with soft links +la1 = f1.create_soft_link(gl, 'la1', '/g1/a1') # la1 points to a1 +print(f"``{la1}`` is a soft link to: ``{la1.target}``") +lt = f1.create_soft_link(gl, 'lt', '/g1/g2/t1') # lt points to t1 (dangling) +print(f"``{lt}`` is a soft link to: ``{lt.target}``") + +# Recreate the '/g1/g2/t1' path +t1 = f1.create_hard_link('/g1/g2', 't1', '/gl/ht') +print(f"``{lt}`` is not dangling anymore") + +# Dereferrencing +plt = lt() +print("dereferred lt node: ``%s``" % plt) +pla1 = la1() +print("dereferred la1 node: ``%s``" % pla1) + +# Copy the array a1 into another file +f2 = tb.open_file('links2.h5', 'w') +new_a1 = a1.copy(f2.root, 'a1') +f2.close() # close the other file + +# Remove the original soft link and create an external link +la1.remove() +la1 = f1.create_external_link(gl, 'la1', 'links2.h5:/a1') +print(f"``{la1}`` is an external link to: ``{la1.target}``") +new_a1 = la1() # dereferrencing la1 returns a1 in links2.h5 +print("dereferred la1 node: ``%s``" % new_a1) +print("new_a1 file:", new_a1._v_file.filename) + +f1.close() diff --git a/examples/multiprocess_access_benchmarks.py b/examples/multiprocess_access_benchmarks.py new file mode 100644 index 0000000..be72202 --- /dev/null +++ b/examples/multiprocess_access_benchmarks.py @@ -0,0 +1,233 @@ +# Benchmark three methods of using PyTables with multiple processes, where data +# is read from a PyTables file in one process and then sent to another +# 
+# 1. using multiprocessing.Pipe +# 2. using a memory mapped file that's shared between two processes, passed as +# out argument to tables.Array.read. +# 3. using a Unix domain socket (this uses the "abstract namespace" and will +# work only on Linux). +# 4. using an IPv4 socket +# +# In all three cases, an array is loaded from a file in one process, sent to +# another, and then modified by incrementing each array element. This is meant +# to simulate retrieving data and then modifying it. + + +import multiprocessing +import os +import random +import select +import socket +import time +from time import perf_counter as clock + +import numpy as np +import tables as tb + + +# create a PyTables file with a single int64 array with the specified number of +# elements +def create_file(array_size): + array = np.ones(array_size, dtype='i8') + with tb.open_file('test.h5', 'w') as fobj: + array = fobj.create_array('/', 'test', array) + print('file created, size: {} MB'.format(array.size_on_disk / 1e6)) + + +# process to receive an array using a multiprocessing.Pipe connection +class PipeReceive(multiprocessing.Process): + + def __init__(self, receiver_pipe, result_send): + super().__init__() + self.receiver_pipe = receiver_pipe + self.result_send = result_send + + def run(self): + # block until something is received on the pipe + array = self.receiver_pipe.recv() + recv_timestamp = clock() + # perform an operation on the received array + array += 1 + finish_timestamp = clock() + assert(np.all(array == 2)) + # send the measured timestamps back to the originating process + self.result_send.send((recv_timestamp, finish_timestamp)) + + +def read_and_send_pipe(send_type, array_size): + # set up Pipe objects to send the actual array to the other process + # and receive the timing results from the other process + array_recv, array_send = multiprocessing.Pipe(False) + result_recv, result_send = multiprocessing.Pipe(False) + # start the other process and pause to allow it to start up + recv_process = PipeReceive(array_recv, result_send) + recv_process.start() + time.sleep(0.15) + with tb.open_file('test.h5', 'r') as fobj: + array = fobj.get_node('/', 'test') + start_timestamp = clock() + # read an array from the PyTables file and send it to the other process + output = array.read(0, array_size, 1) + array_send.send(output) + assert(np.all(output + 1 == 2)) + # receive the timestamps from the other process + recv_timestamp, finish_timestamp = result_recv.recv() + print_results(send_type, start_timestamp, recv_timestamp, finish_timestamp) + recv_process.join() + + +# process to receive an array using a shared memory mapped file +# for real use, this would require creating some protocol to specify the +# array's data type and shape +class MemmapReceive(multiprocessing.Process): + + def __init__(self, path_recv, result_send): + super().__init__() + self.path_recv = path_recv + self.result_send = result_send + + def run(self): + # block until the memmap file path is received from the other process + path = self.path_recv.recv() + # create a memmap array using the received file path + array = np.memmap(path, 'i8', 'r+') + recv_timestamp = clock() + # perform an operation on the array + array += 1 + finish_timestamp = clock() + assert(np.all(array == 2)) + # send the timing results back to the other process + self.result_send.send((recv_timestamp, finish_timestamp)) + + +def read_and_send_memmap(send_type, array_size): + # create a multiprocessing Pipe that will be used to send the memmap + # file path to the 
receiving process + path_recv, path_send = multiprocessing.Pipe(False) + result_recv, result_send = multiprocessing.Pipe(False) + # start the receiving process and pause to allow it to start up + recv_process = MemmapReceive(path_recv, result_send) + recv_process.start() + time.sleep(0.15) + with tb.open_file('test.h5', 'r') as fobj: + array = fobj.get_node('/', 'test') + start_timestamp = clock() + # memmap a file as a NumPy array in 'overwrite' mode + output = np.memmap('/tmp/array1', 'i8', 'w+', shape=(array_size, )) + # read an array from a PyTables file into the memmory mapped array + array.read(0, array_size, 1, out=output) + # use a multiprocessing.Pipe to send the file's path to the receiving + # process + path_send.send('/tmp/array1') + # receive the timestamps from the other process + recv_timestamp, finish_timestamp = result_recv.recv() + # because 'output' is shared between processes, all elements should now + # be equal to 2 + assert(np.all(output == 2)) + print_results(send_type, start_timestamp, recv_timestamp, finish_timestamp) + recv_process.join() + + +# process to receive an array using a socket +# for real use, this would require creating some protocol to specify the +# array's data type and shape +class SocketReceive(multiprocessing.Process): + + def __init__(self, socket_family, address, result_send, array_nbytes): + super().__init__() + self.socket_family = socket_family + self.address = address + self.result_send = result_send + self.array_nbytes = array_nbytes + + def run(self): + # create the socket, listen for a connection and use select to block + # until a connection is made + sock = socket.socket(self.socket_family, socket.SOCK_STREAM) + sock.bind(self.address) + sock.listen(1) + readable, _, _ = select.select([sock], [], []) + # accept the connection and read the sent data into a bytearray + connection = sock.accept()[0] + recv_buffer = bytearray(self.array_nbytes) + view = memoryview(recv_buffer) + bytes_recv = 0 + while bytes_recv < self.array_nbytes: + bytes_recv += connection.recv_into(view[bytes_recv:]) + # convert the bytearray into a NumPy array + array = np.frombuffer(recv_buffer, dtype='i8') + recv_timestamp = clock() + # perform an operation on the received array + array += 1 + finish_timestamp = clock() + assert(np.all(array == 2)) + # send the timestamps back to the originating process + self.result_send.send((recv_timestamp, finish_timestamp)) + connection.close() + sock.close() + + +def unix_socket_address(): + # create a Unix domain address in the abstract namespace + # this will only work on Linux + return b'\x00' + os.urandom(5) + + +def ipv4_socket_address(): + # create an IPv4 socket address + return ('127.0.0.1', random.randint(9000, 10_000)) + + +def read_and_send_socket(send_type, array_size, array_bytes, address_func, + socket_family): + address = address_func() + # start the receiving process and pause to allow it to start up + result_recv, result_send = multiprocessing.Pipe(False) + recv_process = SocketReceive(socket_family, address, result_send, + array_bytes) + recv_process.start() + time.sleep(0.15) + with tb.open_file('test.h5', 'r') as fobj: + array = fobj.get_node('/', 'test') + start_timestamp = clock() + # connect to the receiving process' socket + sock = socket.socket(socket_family, socket.SOCK_STREAM) + sock.connect(address) + # read the array from the PyTables file and send its + # data buffer to the receiving process + output = array.read(0, array_size, 1) + sock.send(output.data) + assert(np.all(output + 1 == 2)) + # 
receive the timestamps from the other process + recv_timestamp, finish_timestamp = result_recv.recv() + sock.close() + print_results(send_type, start_timestamp, recv_timestamp, finish_timestamp) + recv_process.join() + + +def print_results(send_type, start_timestamp, recv_timestamp, + finish_timestamp): + msg = 'type: {0}\t receive: {1:5.5f}, add:{2:5.5f}, total: {3:5.5f}' + print(msg.format(send_type, + recv_timestamp - start_timestamp, + finish_timestamp - recv_timestamp, + finish_timestamp - start_timestamp)) + + +if __name__ == '__main__': + + random.seed(os.urandom(2)) + array_num_bytes = [10**5, 10**6, 10**7, 10**8] + + for array_bytes in array_num_bytes: + array_size = array_bytes // 8 + + create_file(array_size) + read_and_send_pipe('multiproc.Pipe', array_size) + read_and_send_memmap('memmap ', array_size) + # comment out this line to run on an OS other than Linux + read_and_send_socket('Unix socket', array_size, array_bytes, + unix_socket_address, socket.AF_UNIX) + read_and_send_socket('IPv4 socket', array_size, array_bytes, + ipv4_socket_address, socket.AF_INET) + print() diff --git a/examples/multiprocess_access_queues.py b/examples/multiprocess_access_queues.py new file mode 100644 index 0000000..e30b448 --- /dev/null +++ b/examples/multiprocess_access_queues.py @@ -0,0 +1,179 @@ +"""Example showing how to access a PyTables file from multiple processes using +queues.""" + +import queue + +import multiprocessing +import random +import time +from pathlib import Path + +import numpy as np +import tables as tb + + +# this creates an HDF5 file with one array containing n rows +def make_file(file_path, n): + + with tb.open_file(file_path, 'w') as fobj: + array = fobj.create_carray('/', 'array', tb.Int64Atom(), (n, n)) + for i in range(n): + array[i, :] = i + + +# All access to the file goes through a single instance of this class. +# It contains several queues that are used to communicate with other +# processes. +# The read_queue is used for requests to read data from the HDF5 file. +# A list of result_queues is used to send data back to client processes. +# The write_queue is used for requests to modify the HDF5 file. +# One end of a pipe (shutdown) is used to signal the process to terminate. +class FileAccess(multiprocessing.Process): + + def __init__(self, h5_path, read_queue, result_queues, write_queue, + shutdown): + self.h5_path = h5_path + self.read_queue = read_queue + self.result_queues = result_queues + self.write_queue = write_queue + self.shutdown = shutdown + self.block_period = .01 + super().__init__() + + def run(self): + self.h5_file = tb.open_file(self.h5_path, 'r+') + self.array = self.h5_file.get_node('/array') + another_loop = True + while another_loop: + + # Check if the process has received the shutdown signal. + if self.shutdown.poll(): + another_loop = False + + # Check for any data requests in the read_queue. + try: + row_num, proc_num = self.read_queue.get( + True, self.block_period) + # look up the appropriate result_queue for this data processor + # instance + result_queue = self.result_queues[proc_num] + print('processor {} reading from row {}'.format(proc_num, + row_num)) + result_queue.put(self.read_data(row_num)) + another_loop = True + except queue.Empty: + pass + + # Check for any write requests in the write_queue. 
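+ # (the short blocking timeout keeps this loop responsive to the
+ # shutdown pipe without busy-waiting)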
+ try: + row_num, data = self.write_queue.get(True, self.block_period) + print('writing row', row_num) + self.write_data(row_num, data) + another_loop = True + except queue.Empty: + pass + + # close the HDF5 file before shutting down + self.h5_file.close() + + def read_data(self, row_num): + return self.array[row_num, :] + + def write_data(self, row_num, data): + self.array[row_num, :] = data + + +# This class represents a process that does work by reading and writing to the +# HDF5 file. It does this by sending requests to the FileAccess class instance +# through its read and write queues. The data results are sent back through +# the result_queue. +# Its actions are logged to a text file. +class DataProcessor(multiprocessing.Process): + + def __init__(self, read_queue, result_queue, write_queue, proc_num, + array_size, output_file): + self.read_queue = read_queue + self.result_queue = result_queue + self.write_queue = write_queue + self.proc_num = proc_num + self.array_size = array_size + self.output_file = output_file + super().__init__() + + def run(self): + self.output_file = open(self.output_file, 'w') + # read a random row from the file + row_num = random.randrange(self.array_size) + self.read_queue.put((row_num, self.proc_num)) + self.output_file.write(str(row_num) + '\n') + self.output_file.write(str(self.result_queue.get()) + '\n') + + # modify a random row to equal 11 * (self.proc_num + 1) + row_num = random.randrange(self.array_size) + new_data = (np.zeros((1, self.array_size), 'i8') + + 11 * (self.proc_num + 1)) + self.write_queue.put((row_num, new_data)) + + # pause, then read the modified row + time.sleep(0.015) + self.read_queue.put((row_num, self.proc_num)) + self.output_file.write(str(row_num) + '\n') + self.output_file.write(str(self.result_queue.get()) + '\n') + self.output_file.close() + + +# this function starts the FileAccess class instance and +# sets up all the queues used to communicate with it +def make_queues(num_processors): + read_queue = multiprocessing.Queue() + write_queue = multiprocessing.Queue() + shutdown_recv, shutdown_send = multiprocessing.Pipe(False) + result_queues = [multiprocessing.Queue() for i in range(num_processors)] + file_access = FileAccess(file_path, read_queue, result_queues, write_queue, + shutdown_recv) + file_access.start() + return read_queue, result_queues, write_queue, shutdown_send + + +if __name__ == '__main__': + # See the discussion in :issue:`790`. 
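+ # ('spawn' starts the workers in a fresh interpreter instead of forking a
+ # process that may already hold HDF5/threading state, which is not fork-safe)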
+ multiprocessing.set_start_method('spawn') + + file_path = 'test.h5' + n = 10 + make_file(file_path, n) + + num_processors = 3 + (read_queue, result_queues, + write_queue, shutdown_send) = make_queues(num_processors) + + processors = [] + output_files = [] + for i in range(num_processors): + result_queue = result_queues[i] + output_file = str(i) + processor = DataProcessor(read_queue, result_queue, write_queue, i, n, + output_file) + processors.append(processor) + output_files.append(output_file) + + # start all DataProcessor instances + for processor in processors: + processor.start() + + # wait for all DataProcessor instances to finish + for processor in processors: + processor.join() + + # shut down the FileAccess instance + shutdown_send.send(0) + + # print out contents of log files and delete them + print() + for output_file in output_files: + print() + print(f'contents of log file {output_file}') + print(open(output_file).read()) + Path(output_file).unlink() + + Path('test.h5').unlink() diff --git a/examples/nested-tut.py b/examples/nested-tut.py new file mode 100644 index 0000000..9f0bab6 --- /dev/null +++ b/examples/nested-tut.py @@ -0,0 +1,132 @@ +"""Small example showing the use of nested types in PyTables. + +The program creates an output file, 'nested-tut.h5'. You can view it +with ptdump or any HDF5 generic utility. + +:Author: F. Alted +:Date: 2005/06/10 + +""" + +import numpy as np + +import tables as tb + +#'-**-**-**-**- The sample nested class description -**-**-**-**-**-' + + +class Info(tb.IsDescription): + """A sub-structure of Test""" + + _v_pos = 2 # The position in the whole structure + name = tb.StringCol(10) + value = tb.Float64Col(pos=0) + +colors = tb.Enum(['red', 'green', 'blue']) + + +class NestedDescr(tb.IsDescription): + """A description that has several nested columns.""" + + color = tb.EnumCol(colors, 'red', base='uint32') + info1 = Info() + + class info2(tb.IsDescription): + _v_pos = 1 + name = tb.StringCol(10) + value = tb.Float64Col(pos=0) + + class info3(tb.IsDescription): + x = tb.Float64Col(dflt=1) + y = tb.UInt8Col(dflt=1) + +print() +print('-**-**-**-**-**-**- file creation -**-**-**-**-**-**-**-') + +filename = "nested-tut.h5" + +print("Creating file:", filename) +fileh = tb.open_file(filename, "w") + +print() +print('-**-**-**-**-**- nested table creation -**-**-**-**-**-') + +table = fileh.create_table(fileh.root, 'table', NestedDescr) + +# Fill the table with some rows +row = table.row +for i in range(10): + row['color'] = colors[['red', 'green', 'blue'][i % 3]] + row['info1/name'] = "name1-%s" % i + row['info2/name'] = "name2-%s" % i + row['info2/info3/y'] = i + # All the rest will be filled with defaults + row.append() + +table.flush() # flush the row buffer to disk +print(repr(table.nrows)) + +nra = table[::4] +print(repr(nra)) +# Append some additional rows +table.append(nra) +print(repr(table.nrows)) + +# Create a new table +table2 = fileh.create_table(fileh.root, 'table2', nra) +print(repr(table2[:])) + +# Read also the info2/name values with color == colors.red +names = [x['info2/name'] for x in table if x['color'] == colors.red] + +print() +print("**** info2/name elements satisfying color == 'red':", repr(names)) + +print() +print('-**-**-**-**-**-**- table data reading & selection -**-**-**-**-**-') + +# Read the data +print() +print("**** table data contents:\n", table[:]) + +print() +print("**** table.info2 data contents:\n", repr(table.cols.info2[1:5])) + +print() +print("**** table.info2.info3 data contents:\n", + 
repr(table.cols.info2.info3[1:5])) + +print("**** _f_col() ****") +print(repr(table.cols._f_col('info2'))) +print(repr(table.cols._f_col('info2/info3/y'))) + +print() +print('-**-**-**-**-**-**- table metadata -**-**-**-**-**-') + +# Read description metadata +print() +print("**** table description (short):\n", repr(table.description)) +print() +print("**** more from manual, period ***") +print(repr(table.description.info1)) +print(repr(table.description.info2.info3)) +print(repr(table.description._v_nested_names)) +print(repr(table.description.info1._v_nested_names)) +print() +print("**** now some for nested records, take that ****") +print(repr(table.description._v_nested_descr)) +print(repr(np.rec.array(None, shape=0, + dtype=table.description._v_nested_descr))) +print(repr(np.rec.array(None, shape=0, + dtype=table.description.info2._v_nested_descr))) +print() +print("**** and some iteration over descriptions, too ****") +for coldescr in table.description._f_walk(): + print("column-->", coldescr) +print() +print("**** info2 sub-structure description:\n", table.description.info2) +print() +print("**** table representation (long form):\n", repr(table)) + +# Remember to always close the file +fileh.close() diff --git a/examples/nested1.py b/examples/nested1.py new file mode 100644 index 0000000..5e0ee28 --- /dev/null +++ b/examples/nested1.py @@ -0,0 +1,82 @@ +# Example to show how nested types can be dealed with PyTables +# F. Alted 2005/05/27 + +import random +import tables as tb + +fileout = "nested1.h5" + +# An example of enumerated structure +colors = tb.Enum(['red', 'green', 'blue']) + + +def read(file): + fileh = tb.open_file(file, "r") + + print("table (short)-->", fileh.root.table) + print("table (long)-->", repr(fileh.root.table)) + print("table (contents)-->", repr(fileh.root.table[:])) + + fileh.close() + + +def write(file, desc, indexed): + fileh = tb.open_file(file, "w") + table = fileh.create_table(fileh.root, 'table', desc) + for colname in indexed: + table.colinstances[colname].create_index() + + row = table.row + for i in range(10): + row['x'] = i + row['y'] = 10.2 - i + row['z'] = i + row['color'] = colors[random.choice(['red', 'green', 'blue'])] + row['info/name'] = "name%s" % i + row['info/info2/info3/z4'] = i + # All the rest will be filled with defaults + row.append() + + fileh.close() + +# The sample nested class description + + +class Info(tb.IsDescription): + _v_pos = 2 + Name = tb.UInt32Col() + Value = tb.Float64Col() + + +class Test(tb.IsDescription): + """A description that has several columns.""" + + x = tb.Int32Col(shape=2, dflt=0, pos=0) + y = tb.Float64Col(dflt=1.2, shape=(2, 3)) + z = tb.UInt8Col(dflt=1) + color = tb.EnumCol(colors, 'red', base='uint32', shape=(2,)) + Info = Info() + + class info(tb.IsDescription): + _v_pos = 1 + name = tb.StringCol(10) + value = tb.Float64Col(pos=0) + y2 = tb.Float64Col(dflt=1, shape=(2, 3), pos=1) + z2 = tb.UInt8Col(dflt=1) + + class info2(tb.IsDescription): + y3 = tb.Float64Col(dflt=1, shape=(2, 3)) + z3 = tb.UInt8Col(dflt=1) + name = tb.StringCol(10) + value = tb.EnumCol(colors, 'blue', base='uint32', shape=(1,)) + + class info3(tb.IsDescription): + name = tb.StringCol(10) + value = tb.Time64Col() + y4 = tb.Float64Col(dflt=1, shape=(2, 3)) + z4 = tb.UInt8Col(dflt=1) + +# Write the file and read it +write(fileout, Test, ['info/info2/z3']) +read(fileout) +print("You can have a look at '%s' output file now." 
% fileout) diff --git a/examples/objecttree.py b/examples/objecttree.py new file mode 100644 index 0000000..2c98f38 --- /dev/null +++ b/examples/objecttree.py @@ -0,0 +1,50 @@ +import tables as tb + + +class Particle(tb.IsDescription): + identity = tb.StringCol(itemsize=22, dflt=" ", pos=0) + # character String + idnumber = tb.Int16Col(dflt=1, pos=1) # short integer + speed = tb.Float32Col(dflt=1, pos=1) # single-precision + +# Open a file in "w"rite mode +fileh = tb.open_file("objecttree.h5", mode="w") +# Get the HDF5 root group +root = fileh.root + +# Create the groups: +group1 = fileh.create_group(root, "group1") +group2 = fileh.create_group(root, "group2") + +# Now, create an array in root group + +# Currently PyTables arrays don't support Unicode strings, +# so we need to make sure we pass plain bytes + +array1 = fileh.create_array( + root, "array1", [b"string", b"array"], "String array") + +# Create 2 new tables in group1 +table1 = fileh.create_table(group1, "table1", Particle) +table2 = fileh.create_table("/group2", "table2", Particle) +# Create the last table in group2 +array2 = fileh.create_array("/group1", "array2", [1, 2, 3, 4]) + +# Now, fill the tables: +for table in (table1, table2): + # Get the record object associated with the table: + row = table.row + # Fill the table with 10 records + for i in range(10): + # First, assign the values to the Particle record + row['identity'] = 'This is particle: %2d' % (i) + row['idnumber'] = i + row['speed'] = i * 2 + # This injects the Record values + row.append() + + # Flush the table buffers + table.flush() + +# Finally, close the file (this also will flush all the remaining buffers!) +fileh.close() diff --git a/examples/particles.py b/examples/particles.py new file mode 100644 index 0000000..2590352 --- /dev/null +++ b/examples/particles.py @@ -0,0 +1,121 @@ +"""Beware! you need PyTables >= 2.3 to run this script!""" + +from time import perf_counter as clock +import numpy as np +import tables as tb + +# NEVENTS = 10000 +NEVENTS = 20_000 +MAX_PARTICLES_PER_EVENT = 100 + +# Particle description + + +class Particle(tb.IsDescription): + # event_id = tables.Int32Col(pos=1, indexed=True) # event id (indexed) + event_id = tb.Int32Col(pos=1) # event id (not indexed) + particle_id = tb.Int32Col(pos=2) # particle id in the event + parent_id = tb.Int32Col(pos=3) # the id of the parent + # particle (negative + # values means no parent) + momentum = tb.Float64Col(shape=3, pos=4) # momentum of the particle + mass = tb.Float64Col(pos=5) # mass of the particle + +# Create a new table for events +t1 = clock() +print( + f"Creating a table with {NEVENTS * MAX_PARTICLES_PER_EVENT // 2} " + f"entries aprox.. Wait please...") +fileh = tb.open_file("particles-pro.h5", mode="w") +group = fileh.create_group(fileh.root, "events") +table = fileh.create_table(group, 'table', Particle, "A table", + tb.Filters(0)) +# Choose this line if you want data compression +# table = fileh.create_table(group, 'table', Particle, "A table", Filters(1)) + +# Fill the table with events +np.random.seed(1) # In order to have reproducible results +particle = table.row +for i in range(NEVENTS): + for j in range(np.random.randint(0, MAX_PARTICLES_PER_EVENT)): + particle['event_id'] = i + particle['particle_id'] = j + particle['parent_id'] = j - 10 # 10 root particles (max) + particle['momentum'] = np.random.normal(5.0, 2.0, size=3) + particle['mass'] = np.random.normal(500.0, 10.0) + # This injects the row values. 
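+        # (Side note) Row.append() only copies the current values into the
+        # table's in-memory I/O buffer; rows reach the HDF5 file when that
+        # buffer fills up or when table.flush() is called below.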
+ particle.append() +table.flush() +print(f"Added {table.nrows} entries --- Time: {clock() - t1:.3f} sec") + +t1 = clock() +print("Creating index...") +table.cols.event_id.create_index(optlevel=0, _verbose=True) +print(f"Index created --- Time: {clock() - t1:.3f} sec") +# Add the number of events as an attribute +table.attrs.nevents = NEVENTS + +fileh.close() + +# Open the file en read only mode and start selections +print("Selecting events...") +fileh = tb.open_file("particles-pro.h5", mode="r") +table = fileh.root.events.table + +print("Particles in event 34:", end=' ') +nrows = 0 +t1 = clock() +for row in table.where("event_id == 34"): + nrows += 1 +print(nrows) +print(f"Done --- Time: {clock() - t1:.3f} sec") + +print("Root particles in event 34:", end=' ') +nrows = 0 +t1 = clock() +for row in table.where("event_id == 34"): + if row['parent_id'] < 0: + nrows += 1 +print(nrows) +print(f"Done --- Time: {clock() - t1:.3f} sec") + +print("Sum of masses of root particles in event 34:", end=' ') +smass = 0.0 +t1 = clock() +for row in table.where("event_id == 34"): + if row['parent_id'] < 0: + smass += row['mass'] +print(smass) +print(f"Done --- Time: {clock() - t1:.3f} sec") + +print( + "Sum of masses of daughter particles for particle 3 in event 34:", end=' ') +smass = 0.0 +t1 = clock() +for row in table.where("event_id == 34"): + if row['parent_id'] == 3: + smass += row['mass'] +print(smass) +print(f"Done --- Time: {clock() - t1:.3f} sec") + +print("Sum of module of momentum for particle 3 in event 34:", end=' ') +smomentum = 0.0 +t1 = clock() +# for row in table.where("(event_id == 34) & ((parent_id) == 3)"): +for row in table.where("event_id == 34"): + if row['parent_id'] == 3: + smomentum += np.sqrt(np.add.reduce(row['momentum'] ** 2)) +print(smomentum) +print(f"Done --- Time: {clock() - t1:.3f} sec") + +# This is the same than above, but using generator expressions +# Python >= 2.4 needed here! +print("Sum of module of momentum for particle 3 in event 34 (2):", end=' ') +t1 = clock() +print(sum(np.sqrt(np.add.reduce(row['momentum'] ** 2)) + for row in table.where("event_id == 34") + if row['parent_id'] == 3)) +print(f"Done --- Time: {clock() - t1:.3f} sec") + + +fileh.close() diff --git a/examples/play-with-enums.py b/examples/play-with-enums.py new file mode 100644 index 0000000..40eb604 --- /dev/null +++ b/examples/play-with-enums.py @@ -0,0 +1,98 @@ +# Example on using enumerated types under PyTables. +# This file is intended to be run in an interactive Python session, +# since it contains some statements that raise exceptions. +# To run it, paste it as the input of ``python``. + + + +def COMMENT(string): + pass + + +COMMENT("**** Usage of the ``Enum`` class. ****") + +COMMENT("Create an enumeration of colors with automatic concrete values.") +import tables as tb +colorList = ['red', 'green', 'blue', 'white', 'black'] +colors = tb.Enum(colorList) + +COMMENT("Take a look at the name-value pairs.") +print("Colors:", [v for v in colors]) + +COMMENT("Access values as attributes.") +print("Value of 'red' and 'white':", (colors.red, colors.white)) +print("Value of 'yellow':", colors.yellow) + +COMMENT("Access values as items.") +print("Value of 'red' and 'white':", (colors['red'], colors['white'])) +print("Value of 'yellow':", colors['yellow']) + +COMMENT("Access names.") +print("Name of value %s:" % colors.red, colors(colors.red)) +print("Name of value 1234:", colors(1234)) + + +COMMENT("**** Enumerated columns. 
****") + +COMMENT("Create a new PyTables file.") +h5f = tb.open_file('enum.h5', 'w') + +COMMENT("This describes a ball extraction.") + + +class BallExt(tb.IsDescription): + ballTime = tb.Time32Col() + ballColor = tb.EnumCol(colors, 'black', base='uint8') + +COMMENT("Create a table of ball extractions.") +tbl = h5f.create_table( + '/', 'extractions', BallExt, title="Random ball extractions") + +COMMENT("Simulate some ball extractions.") +import time +import random +now = time.time() +row = tbl.row +for i in range(10): + row['ballTime'] = now + i + row['ballColor'] = colors[random.choice(colorList)] # notice this + row.append() + +COMMENT("Try to append an invalid value.") +row['ballTime'] = now + 42 +row['ballColor'] = 1234 + +tbl.flush() + +COMMENT("Now print them!") +for r in tbl: + ballTime = r['ballTime'] + ballColor = colors(r['ballColor']) # notice this + print("Ball extracted on %d is of color %s." % (ballTime, ballColor)) + + +COMMENT("**** Enumerated arrays. ****") + +COMMENT("This describes a range of working days.") +workingDays = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5} +dayRange = tb.EnumAtom(workingDays, 'Mon', base='uint16', shape=(0, 2)) + +COMMENT("Create an EArray of day ranges within a week.") +earr = h5f.create_earray('/', 'days', dayRange, title="Working day ranges") +earr.flavor = 'python' + +COMMENT("Throw some day ranges in.") +wdays = earr.get_enum() +earr.append([(wdays.Mon, wdays.Fri), (wdays.Wed, wdays.Fri)]) + +COMMENT("The append method does not check values!") +earr.append([(wdays.Mon, 1234)]) + +COMMENT("Print the values.") +for (d1, d2) in earr: + print("From %s to %s (%d days)." % (wdays(d1), wdays(d2), d2 - d1 + 1)) + +COMMENT("Close the PyTables file and remove it.") +from pathlib import Path +h5f.close() +Path('enum.h5').unlink() diff --git a/examples/read_array_out_arg.py b/examples/read_array_out_arg.py new file mode 100644 index 0000000..feed85b --- /dev/null +++ b/examples/read_array_out_arg.py @@ -0,0 +1,58 @@ +# This script compares reading from an array in a loop using the +# tables.Array.read method. In the first case, read is used without supplying +# an 'out' argument, which causes a new output buffer to be pre-allocated +# with each call. In the second case, the buffer is created once, and then +# reused. 
+ + + +from time import perf_counter as clock + +import numpy as np +import tables as tb + + +def create_file(array_size): + array = np.ones(array_size, dtype='i8') + with tb.open_file('test.h5', 'w') as fobj: + array = fobj.create_array('/', 'test', array) + print('file created, size: {} MB'.format(array.size_on_disk / 1e6)) + + +def standard_read(array_size): + N = 10 + with tb.open_file('test.h5', 'r') as fobj: + array = fobj.get_node('/', 'test') + start = clock() + for i in range(N): + output = array.read(0, array_size, 1) + end = clock() + assert(np.all(output == 1)) + print('standard read \t {:5.5f}'.format((end - start) / N)) + + +def pre_allocated_read(array_size): + N = 10 + with tb.open_file('test.h5', 'r') as fobj: + array = fobj.get_node('/', 'test') + start = clock() + output = np.empty(array_size, 'i8') + for i in range(N): + array.read(0, array_size, 1, out=output) + end = clock() + assert(np.all(output == 1)) + print('pre-allocated read\t {:5.5f}'.format((end - start) / N)) + + +if __name__ == '__main__': + + array_num_bytes = [10**5, 10**6, 10**7, 10**8] + + for array_bytes in array_num_bytes: + + array_size = array_bytes // 8 + + create_file(array_size) + standard_read(array_size) + pre_allocated_read(array_size) + print() diff --git a/examples/simple_threading.py b/examples/simple_threading.py new file mode 100644 index 0000000..662daf8 --- /dev/null +++ b/examples/simple_threading.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +import math +import queue +import threading +from pathlib import Path + +import numpy as np +import tables as tb + + +SIZE = 100 +NTHREADS = 5 +FILENAME = 'simple_threading.h5' +H5PATH = '/array' + + +def create_test_file(filename): + data = np.random.rand(SIZE, SIZE) + + with tb.open_file(filename, 'w') as h5file: + h5file.create_array('/', 'array', title="Test Array", obj=data) + + +def chunk_generator(data_size, nchunks): + chunk_size = math.ceil(data_size / nchunks) + for start in range(0, data_size, chunk_size): + yield slice(start, start + chunk_size) + + +lock = threading.Lock() + + +def synchronized_open_file(*args, **kwargs): + with lock: + return tb.open_file(*args, **kwargs) + + +def synchronized_close_file(self, *args, **kwargs): + with lock: + return self.close(*args, **kwargs) + + +def run(filename, path, inqueue, outqueue): + try: + yslice = inqueue.get() + h5file = synchronized_open_file(filename, mode='r') + h5array = h5file.get_node(path) + data = h5array[yslice, ...] 
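+        # (Side note) each worker thread opens its own file handle, so only
+        # open/close go through the lock above; the slice read itself runs
+        # on the thread's private handle without extra serialization.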
+ psum = np.sum(data) + except Exception as e: + outqueue.put(e) + else: + outqueue.put(psum) + finally: + synchronized_close_file(h5file) + + +def main(): + # generate the test data + if not Path(FILENAME).exists(): + create_test_file(FILENAME) + + threads = [] + inqueue = queue.Queue() + outqueue = queue.Queue() + + # start all threads + for _ in range(NTHREADS): + thread = threading.Thread(target=run, + args=(FILENAME, H5PATH, inqueue, outqueue)) + thread.start() + threads.append(thread) + + # push requests in the input queue + for yslice in chunk_generator(SIZE, len(threads)): + inqueue.put(yslice) + + # collect results + try: + mean_ = 0 + + for _ in range(len(threads)): + out = outqueue.get() + if isinstance(out, Exception): + raise out + else: + mean_ += out + + mean_ /= SIZE * SIZE + + finally: + for thread in threads: + thread.join() + + # print results + print(f'Mean: {mean_}') + + +if __name__ == '__main__': + main() diff --git a/examples/split.py b/examples/split.py new file mode 100644 index 0000000..717b380 --- /dev/null +++ b/examples/split.py @@ -0,0 +1,39 @@ +"""Use the H5FD_SPLIT driver to store metadata and raw data in separate files. + +In this example, we store the metadata file in the current directory and +the raw data file in a subdirectory. + +""" + +import errno +from pathlib import Path + +import numpy as np +import tables as tb + +FNAME = "split" +DRIVER = "H5FD_SPLIT" +RAW_DIR = Path(__file__).with_name("raw") +DRIVER_PROPS = { + "driver_split_raw_ext": str(RAW_DIR / "%s-r.h5") +} +DATA_SHAPE = (2, 10) + + +class FooBar(tb.IsDescription): + tag = tb.StringCol(16) + data = tb.Float32Col(shape=DATA_SHAPE) + +try: + RAW_DIR.mkdir() +except OSError as e: + if e.errno == errno.EEXIST: + pass +with tb.open_file(FNAME, mode="w", driver=DRIVER, **DRIVER_PROPS) as f: + group = f.create_group("/", "foo", "foo desc") + table = f.create_table(group, "bar", FooBar, "bar desc") + for i in range(5): + table.row["tag"] = "t%d" % i + table.row["data"] = np.random.random_sample(DATA_SHAPE) + table.row.append() + table.flush() diff --git a/examples/table-tree.py b/examples/table-tree.py new file mode 100644 index 0000000..8ffc6ae --- /dev/null +++ b/examples/table-tree.py @@ -0,0 +1,300 @@ +import numpy as np +import tables as tb + + +class Particle(tb.IsDescription): + ADCcount = tb.Int16Col() # signed short integer + TDCcount = tb.UInt8Col() # unsigned byte + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + idnumber = tb.Int64Col() # signed long long + name = tb.StringCol(16, dflt="") # 16-character String + pressure = tb.Float32Col(shape=2) # float (single-precision) + temperature = tb.Float64Col() # double (double-precision) + +Particle2 = { + # You can also use any of the atom factories, i.e. the one which + # accepts a PyTables type. 
+ "ADCcount": tb.Col.from_type("int16"), # signed short integer + "TDCcount": tb.Col.from_type("uint8"), # unsigned byte + "grid_i": tb.Col.from_type("int32"), # integer + "grid_j": tb.Col.from_type("int32"), # integer + "idnumber": tb.Col.from_type("int64"), # signed long long + "name": tb.Col.from_kind("string", 16), # 16-character String + "pressure": tb.Col.from_type("float32", (2,)), # float + # (single-precision) + "temperature": tb.Col.from_type("float64"), # double + # (double-precision) +} + +# The name of our HDF5 filename +filename = "table-tree.h5" + +# Open a file in "w"rite mode +h5file = tb.open_file(filename, mode="w") + +# Create a new group under "/" (root) +group = h5file.create_group("/", 'detector') + +# Create one table on it +# table = h5file.create_table(group, 'table', Particle, "Title example") +# You can choose creating a Table from a description dictionary if you wish +table = h5file.create_table(group, 'table', Particle2, "Title example") + +# Create a shortcut to the table record object +particle = table.row + +# Fill the table with 10 particles +for i in range(10): + # First, assign the values to the Particle record + particle['name'] = 'Particle: %6d' % (i) + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = [float(i * i), float(i * 2)] + particle['temperature'] = float(i ** 2) + particle['idnumber'] = i * (2 ** 34) # This exceeds integer range + # This injects the Record values. + particle.append() + +# Flush the buffers for table +table.flush() + +# Get actual data from table. We are interested in column pressure. +pressure = [p['pressure'] for p in table.iterrows()] +print("Last record ==>", pressure) +print("Column pressure ==>", np.array(pressure)) +print("Total records in table ==> ", len(pressure)) +print() + +# Create a new group to hold new arrays +gcolumns = h5file.create_group("/", "columns") +print("columns ==>", gcolumns, pressure) +# Create an array with this info under '/columns' having a 'list' flavor +h5file.create_array(gcolumns, 'pressure', pressure, + "Pressure column") +print("gcolumns.pressure type ==> ", gcolumns.pressure.atom.dtype) + +# Do the same with TDCcount, but with a numpy object +TDC = [p['TDCcount'] for p in table.iterrows()] +print("TDC ==>", TDC) +print("TDC shape ==>", np.array(TDC).shape) +h5file.create_array('/columns', 'TDC', np.array(TDC), "TDCcount column") + +# Do the same with name column +names = [p['name'] for p in table.iterrows()] +print("names ==>", names) +h5file.create_array('/columns', 'name', names, "Name column") +# This works even with homogeneous tuples or lists (!) +print("gcolumns.name shape ==>", gcolumns.name.shape) +print("gcolumns.name type ==> ", gcolumns.name.atom.dtype) + +print("Table dump:") +for p in table.iterrows(): + print(p) + +# Save a recarray object under detector +r = np.rec.array("a" * 300, formats='f4,3i4,a5,i2', shape=3) +recarrt = h5file.create_table("/detector", 'recarray', r, "RecArray example") +r2 = r[0:3:2] +# Change the byteorder property +recarrt = h5file.create_table("/detector", 'recarray2', r2, + "Non-contiguous recarray") +print(recarrt) +print() + +print(h5file.root.detector.table.description) +# Close the file +h5file.close() + +# sys.exit() + +# Reopen it in append mode +h5file = tb.open_file(filename, "a") + +# Ok. 
let's start browsing the tree from this filename +print("Reading info from filename:", h5file.filename) +print() + +# Firstly, list all the groups on tree +print("Groups in file:") +for group in h5file.walk_groups("/"): + print(group) +print() + +# List all the nodes (Group and Leaf objects) on tree +print("List of all nodes in file:") +print(h5file) + +# And finally, only the Arrays (Array objects) +print("Arrays in file:") +for array in h5file.walk_nodes("/", classname="Array"): + print(array) +print() + +# Get group /detector and print some info on it +detector = h5file.get_node("/detector") +print("detector object ==>", detector) + +# List only leaves on detector +print("Leaves in group", detector, ":") +for leaf in h5file.list_nodes("/detector", 'Leaf'): + print(leaf) +print() + +# List only tables on detector +print("Tables in group", detector, ":") +for leaf in h5file.list_nodes("/detector", 'Table'): + print(leaf) +print() + +# List only arrays on detector (there should be none!) +print("Arrays in group", detector, ":") +for leaf in h5file.list_nodes("/detector", 'Array'): + print(leaf) +print() + +# Get "/detector" Group object +group = h5file.root.detector +print("/detector ==>", group) + +# Get the "/detector/table +table = h5file.get_node("/detector/table") +print("/detector/table ==>", table) + +# Get metadata from table +print("Object:", table) +print("Table name:", table.name) +print("Table title:", table.title) +print("Rows saved on table: %d" % (table.nrows)) + +print("Variable names on table with their type:") +for name in table.colnames: + print(" ", name, ':=', table.coldtypes[name]) +print() + +# Read arrays in /columns/names and /columns/pressure + +# Get the object in "/columns pressure" +pressureObject = h5file.get_node("/columns", "pressure") + +# Get some metadata on this object +print("Info on the object:", pressureObject) +print(" shape ==>", pressureObject.shape) +print(" title ==>", pressureObject.title) +print(" type ==> ", pressureObject.atom.dtype) +print(" byteorder ==> ", pressureObject.byteorder) + +# Read the pressure actual data +pressureArray = pressureObject.read() +print(" data type ==>", type(pressureArray)) +print(" data ==>", pressureArray) +print() + +# Get the object in "/columns/names" +nameObject = h5file.root.columns.name + +# Get some metadata on this object +print("Info on the object:", nameObject) +print(" shape ==>", nameObject.shape) +print(" title ==>", nameObject.title) +print(" type ==> " % nameObject.atom.dtype) + + +# Read the 'name' actual data +nameArray = nameObject.read() +print(" data type ==>", type(nameArray)) +print(" data ==>", nameArray) + +# Print the data for both arrays +print("Data on arrays name and pressure:") +for i in range(pressureObject.shape[0]): + print("".join(nameArray[i]), "-->", pressureArray[i]) +print() + + +# Finally, append some new records to table +table = h5file.root.detector.table + +# Append 5 new particles to table (yes, tables can be enlarged!) +particle = table.row +for i in range(10, 15): + # First, assign the values to the Particle record + particle['name'] = 'Particle: %6d' % (i) + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = [float(i * i), float(i * 2)] + particle['temperature'] = float(i ** 2) + particle['idnumber'] = i * (2 ** 34) # This exceeds integer range + # This injects the Row values. 
+ particle.append() + +# Flush this table +table.flush() + +print("Columns name and pressure on expanded table:") +# Print some table columns, for comparison with array data +for p in table: + print(p['name'], '-->', p['pressure']) +print() + +# Put several flavors +oldflavor = table.flavor +print(table.read(field="ADCcount")) +table.flavor = "numpy" +print(table.read(field="ADCcount")) +table.flavor = oldflavor +print(table.read(0, 0, 1, "name")) +table.flavor = "python" +print(table.read(0, 0, 1, "name")) +table.flavor = oldflavor +print(table.read(0, 0, 2, "pressure")) +table.flavor = "python" +print(table.read(0, 0, 2, "pressure")) +table.flavor = oldflavor + +# Several range selections +print("Extended slice in selection: [0:7:6]") +print(table.read(0, 7, 6)) +print("Single record in selection: [1]") +print(table.read(1)) +print("Last record in selection: [-1]") +print(table.read(-1)) +print("Two records before the last in selection: [-3:-1]") +print(table.read(-3, -1)) + +# Print a recarray in table form +table = h5file.root.detector.recarray2 +print("recarray2:", table) +print(" nrows:", table.nrows) +print(" byteorder:", table.byteorder) +print(" coldtypes:", table.coldtypes) +print(" colnames:", table.colnames) + +print(table.read()) +for p in table.iterrows(): + print(p['f1'], '-->', p['f2']) +print() + +result = [rec['f1'] for rec in table if rec.nrow < 2] +print(result) + +# Test the File.rename_node() method +# h5file.rename_node(h5file.root.detector.recarray2, "recarray3") +h5file.rename_node(table, "recarray3") +# Delete a Leaf from the HDF5 tree +h5file.remove_node(h5file.root.detector.recarray3) +# Delete the detector group and its leaves recursively +# h5file.remove_node(h5file.root.detector, recursive=1) +# Create a Group and then remove it +h5file.create_group(h5file.root, "newgroup") +h5file.remove_node(h5file.root, "newgroup") +h5file.rename_node(h5file.root.columns, "newcolumns") + +print(h5file) + +# Close this file +h5file.close() diff --git a/examples/table1.py b/examples/table1.py new file mode 100644 index 0000000..01da3e3 --- /dev/null +++ b/examples/table1.py @@ -0,0 +1,72 @@ +import tables as tb + + +class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.Int32Col(pos=2) # integer + longi = tb.Int32Col(pos=3) # integer + pressure = tb.Float32Col(pos=4) # float (single-precision) + temperature = tb.Float64Col(pos=5) # double (double-precision) + +# Open a file in "w"rite mode +fileh = tb.open_file("table1.h5", mode="w") +# Create a new group +group = fileh.create_group(fileh.root, "newgroup") + +# Create a new table in newgroup group +table = fileh.create_table(group, 'table', Particle, "A table", + tb.Filters(1)) +particle = table.row + +# Fill the table with 10 particles +for i in range(10): + # First, assign the values to the Particle record + particle['name'] = 'Particle: %6d' % (i) + particle['lati'] = i + particle['longi'] = 10 - i + particle['pressure'] = float(i * i) + particle['temperature'] = float(i ** 2) + # This injects the row values. + particle.append() + +# We need to flush the buffers in table in order to get an +# accurate number of records on it. 
+table.flush() + +# Add a couple of user attrs +table.attrs.user_attr1 = 1.023 +table.attrs.user_attr2 = "This is the second user attr" + +# Append several rows in only one call +table.append([("Particle: 10", 10, 0, 10 * 10, 10 ** 2), + ("Particle: 11", 11, -1, 11 * 11, 11 ** 2), + ("Particle: 12", 12, -2, 12 * 12, 12 ** 2)]) + +group = fileh.root.newgroup +print("Nodes under group", group, ":") +for node in fileh.list_nodes(group): + print(node) +print() + +print("Leaves everywhere in file", fileh.filename, ":") +for leaf in fileh.walk_nodes(classname="Leaf"): + print(leaf) +print() + +table = fileh.root.newgroup.table +print("Object:", table) +print(f"Table name: {table.name}. Table title: {table.title}") +print("Rows saved on table: %d" % (table.nrows)) + +print("Variable names on table with their type:") +for name in table.colnames: + print(" ", name, ':=', table.coldtypes[name]) + +print("Table contents:") +for row in table: + print(row[:]) +print("Associated recarray:") +print(table.read()) + +# Finally, close the file +fileh.close() diff --git a/examples/table2.py b/examples/table2.py new file mode 100644 index 0000000..8d46d04 --- /dev/null +++ b/examples/table2.py @@ -0,0 +1,41 @@ +# This shows how to use the cols accessors for table columns +import tables as tb + + +class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.Int32Col(pos=2) # integer + longi = tb.Int32Col(pos=3) # integer + vector = tb.Int32Col(shape=(2,), pos=4) # Integer + matrix2D = tb.Float64Col(shape=(2, 2), pos=5) # double (double-precision) + +# Open a file in "w"rite mode +fileh = tb.open_file("table2.h5", mode="w") +table = fileh.create_table(fileh.root, 'table', Particle, "A table") +# Append several rows in only one call +table.append( + [("Particle: 10", 10, 0, (10 * 9, 1), [[10 ** 2, 11 * 3]] * 2), + ("Particle: 11", 11, -1, + (11 * 10, 2), [[11 ** 2, 10 * 3]] * 2), + ("Particle: 12", 12, -2, + (12 * 11, 3), [[12 ** 2, 9 * 3]] * 2), + ("Particle: 13", 13, -3, + (13 * 11, 4), [[13 ** 2, 8 * 3]] * 2), + ("Particle: 14", 14, -4, (14 * 11, 5), [[14 ** 2, 7 * 3]] * 2)]) + +print("str(Cols)-->", table.cols) +print("repr(Cols)-->", repr(table.cols)) +print("Column handlers:") +for name in table.colnames: + print(table.cols._f_col(name)) + +print("Select table.cols.name[1]-->", table.cols.name[1]) +print("Select table.cols.name[1:2]-->", table.cols.name[1:2]) +print("Select table.cols.name[:]-->", table.cols.name[:]) +print("Select table.cols._f_col('name')[:]-->", table.cols._f_col('name')[:]) +print("Select table.cols.lati[1]-->", table.cols.lati[1]) +print("Select table.cols.lati[1:2]-->", table.cols.lati[1:2]) +print("Select table.cols.vector[:]-->", table.cols.vector[:]) +print("Select table.cols['matrix2D'][:]-->", table.cols.matrix2D[:]) + +fileh.close() diff --git a/examples/table3.py b/examples/table3.py new file mode 100644 index 0000000..3c8a8ef --- /dev/null +++ b/examples/table3.py @@ -0,0 +1,39 @@ +# This is an example on how to use complex columns +import tables as tb + + +class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.ComplexCol(itemsize=16, pos=2) + longi = tb.ComplexCol(itemsize=8, pos=3) + vector = tb.ComplexCol(itemsize=8, shape=(2,), pos=4) + matrix2D = tb.ComplexCol(itemsize=16, shape=(2, 2), pos=5) + +# Open a file in "w"rite mode +fileh = tb.open_file("table3.h5", mode="w") +table = fileh.create_table(fileh.root, 'table', Particle, "A table") +# Append several rows in only one call 
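+# (Side note) the rows appended below hold complex values: ComplexCol's
+# itemsize counts the whole complex number, so itemsize=8 in the Particle
+# description is single precision (numpy complex64) and itemsize=16 is
+# double precision (complex128).  A quick check, as a rough sketch:
+#
+#     tb.ComplexCol(itemsize=16, pos=1).dtype   # expected: dtype('complex128')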
+table.append([ + ("Particle: 10", 10j, 0, (10 * 9 + 1j, 1), [[10 ** 2j, 11 * 3]] * 2), + ("Particle: 11", 11j, -1, (11 * 10 + 2j, 2), [[11 ** 2j, 10 * 3]] * 2), + ("Particle: 12", 12j, -2, (12 * 11 + 3j, 3), [[12 ** 2j, 9 * 3]] * 2), + ("Particle: 13", 13j, -3, (13 * 11 + 4j, 4), [[13 ** 2j, 8 * 3]] * 2), + ("Particle: 14", 14j, -4, (14 * 11 + 5j, 5), [[14 ** 2j, 7 * 3]] * 2) +]) + +print("str(Cols)-->", table.cols) +print("repr(Cols)-->", repr(table.cols)) +print("Column handlers:") +for name in table.colnames: + print(table.cols._f_col(name)) + +print("Select table.cols.name[1]-->", table.cols.name[1]) +print("Select table.cols.name[1:2]-->", table.cols.name[1:2]) +print("Select table.cols.name[:]-->", table.cols.name[:]) +print("Select table.cols._f_col('name')[:]-->", table.cols._f_col('name')[:]) +print("Select table.cols.lati[1]-->", table.cols.lati[1]) +print("Select table.cols.lati[1:2]-->", table.cols.lati[1:2]) +print("Select table.cols.vector[:]-->", table.cols.vector[:]) +print("Select table.cols['matrix2D'][:]-->", table.cols.matrix2D[:]) + +fileh.close() diff --git a/examples/tables-with-padding.py b/examples/tables-with-padding.py new file mode 100644 index 0000000..f139e16 --- /dev/null +++ b/examples/tables-with-padding.py @@ -0,0 +1,53 @@ +# This is an example on how to use complex columns +import numpy as np +import tables as tb + +N = 1000 + +padded_dtype = np.dtype([('string', 'S3'), ('int', 'i4'), ('double', 'f8')], align=True) +#assert padded_dtype.itemsize == 16 +padded_struct = np.zeros(N, padded_dtype) + +padded_struct['string'] = np.arange(N).astype('S3') +padded_struct['int'] = np.arange(N, dtype='i4') +padded_struct['double'] = np.arange(N, dtype='f8') + +# Create a file with padding (the default) +fileh = tb.open_file("tables-with-padding.h5", mode="w", pytables_sys_attrs=False) +table = fileh.create_table(fileh.root, 'table', padded_struct, "A table with padding") +print("table *with* padding -->", table) +print("table.description --> ", table.description) +print("table.descrition._v_offsets-->", table.description._v_offsets) +print("table.descrition._v_itemsize-->", table.description._v_itemsize) + +fileh.close() + +# Create another file without padding +fileh = tb.open_file("tables-without-padding.h5", mode="w", pytables_sys_attrs=False, allow_padding=False) +table = fileh.create_table(fileh.root, 'table', padded_struct, "A table without padding") +print("\ntable *without* padding -->", table) +print("table.description --> ", table.description) +print("table.descrition._v_offsets-->", table.description._v_offsets) +print("table.descrition._v_itemsize-->", table.description._v_itemsize) + +fileh.close() + +print("\n ***After closing***\n") + +fileh = tb.open_file("tables-with-padding.h5", mode="r") +table = fileh.root.table +print("table *with* padding -->", table) +print("table.description --> ", table.description) +print("table.descrition._v_offsets-->", table.description._v_offsets) +print("table.descrition._v_itemsize-->", table.description._v_itemsize) + +fileh.close() + +fileh = tb.open_file("tables-without-padding.h5", mode="r") +table = fileh.root.table +print("\ntable *without* padding -->", table) +print("table.description --> ", table.description) +print("table.descrition._v_offsets-->", table.description._v_offsets) +print("table.descrition._v_itemsize-->", table.description._v_itemsize) + +fileh.close() diff --git a/examples/threading_monkeypatch.py b/examples/threading_monkeypatch.py new file mode 100644 index 0000000..b433408 --- /dev/null 
+++ b/examples/threading_monkeypatch.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 + +import math +import queue +import functools +import threading +from pathlib import Path + +import numpy as np +import tables as tb + + +class ThreadsafeFileRegistry(tb.file._FileRegistry): + lock = threading.RLock() + + @property + def handlers(self): + return self._handlers.copy() + + def add(self, handler): + with self.lock: + return super().add(handler) + + def remove(self, handler): + with self.lock: + return super().remove(handler) + + def close_all(self): + with self.lock: + return super().close_all(handler) + + +class ThreadsafeFile(tb.file.File): + def __init__(self, *args, **kargs): + with ThreadsafeFileRegistry.lock: + super().__init__(*args, **kargs) + + def close(self): + with ThreadsafeFileRegistry.lock: + super().close() + + +@functools.wraps(tb.open_file) +def synchronized_open_file(*args, **kwargs): + with ThreadsafeFileRegistry.lock: + return tb.file._original_open_file(*args, **kwargs) + + +# monkey patch the tables package +tb.file._original_open_file = tb.file.open_file +tb.file.open_file = synchronized_open_file +tb.open_file = synchronized_open_file + +tb.file._original_File = tb.file.File +tb.file.File = ThreadsafeFile +tb.File = ThreadsafeFile + +tb.file._open_files = ThreadsafeFileRegistry() + + +SIZE = 100 +NTHREADS = 5 +FILENAME = 'simple_threading.h5' +H5PATH = '/array' + + +def create_test_file(filename): + data = np.random.rand(SIZE, SIZE) + + with tb.open_file(filename, 'w') as h5file: + h5file.create_array('/', 'array', title="Test Array", obj=data) + + +def chunk_generator(data_size, nchunks): + chunk_size = math.ceil(data_size / nchunks) + for start in range(0, data_size, chunk_size): + yield slice(start, start + chunk_size) + + +def run(filename, path, inqueue, outqueue): + try: + yslice = inqueue.get() + with tb.open_file(filename, mode='r') as h5file: + h5array = h5file.get_node(path) + data = h5array[yslice, ...] + psum = np.sum(data) + except Exception as e: + outqueue.put(e) + else: + outqueue.put(psum) + + +def main(): + # generate the test data + if not Path(FILENAME).exists(): + create_test_file(FILENAME) + + threads = [] + inqueue = queue.Queue() + outqueue = queue.Queue() + + # start all threads + for i in range(NTHREADS): + thread = threading.Thread(target=run, + args=(FILENAME, H5PATH, inqueue, outqueue)) + thread.start() + threads.append(thread) + + # push requests in the input queue + for yslice in chunk_generator(SIZE, len(threads)): + inqueue.put(yslice) + + # collect results + try: + mean_ = 0 + + for _ in range(len(threads)): + out = outqueue.get() + if isinstance(out, Exception): + raise out + else: + mean_ += out + + mean_ /= SIZE * SIZE + + finally: + for thread in threads: + thread.join() + + # print results + print(f'Mean: {mean_}') + + +if __name__ == '__main__': + main() diff --git a/examples/tutorial1-1.py b/examples/tutorial1-1.py new file mode 100644 index 0000000..2cd623a --- /dev/null +++ b/examples/tutorial1-1.py @@ -0,0 +1,111 @@ +"""Small but quite comprehensive example showing the use of PyTables. + +The program creates an output file, 'tutorial1.h5'. You can view it +with any HDF5 generic utility. 
+ +""" + +import numpy as np +import tables as tb + + + #'-**-**-**-**-**-**- user record definition -**-**-**-**-**-**-**-' + +# Define a user record to characterize some kind of particles +class Particle(tb.IsDescription): + name = tb.StringCol(16) # 16-character String + idnumber = tb.Int64Col() # Signed 64-bit integer + ADCcount = tb.UInt16Col() # Unsigned short integer + TDCcount = tb.UInt8Col() # unsigned byte + grid_i = tb.Int32Col() # integer + grid_j = tb.Int32Col() # integer + pressure = tb.Float32Col() # float (single-precision) + energy = tb.Float64Col() # double (double-precision) + +print() +print('-**-**-**-**-**-**- file creation -**-**-**-**-**-**-**-') + +# The name of our HDF5 filename +filename = "tutorial1.h5" + +print("Creating file:", filename) + +# Open a file in "w"rite mode +h5file = tb.open_file(filename, mode="w", title="Test file") + +print() +print('-**-**-**-**-**- group and table creation -**-**-**-**-**-**-**-') + +# Create a new group under "/" (root) +group = h5file.create_group("/", 'detector', 'Detector information') +print("Group '/detector' created") + +# Create one table on it +table = h5file.create_table(group, 'readout', Particle, "Readout example") +print("Table '/detector/readout' created") + +# Print the file +print(h5file) +print() +print(repr(h5file)) + +# Get a shortcut to the record object in table +particle = table.row + +# Fill the table with 10 particles +for i in range(10): + particle['name'] = 'Particle: %6d' % (i) + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = float(i * i) + particle['energy'] = float(particle['pressure'] ** 4) + particle['idnumber'] = i * (2 ** 34) + particle.append() + +# Flush the buffers for table +table.flush() + +print() +print('-**-**-**-**-**-**- table data reading & selection -**-**-**-**-**-') + +# Read actual data from table. We are interested in collecting pressure values +# on entries where TDCcount field is greater than 3 and pressure less than 50 +xs = [x for x in table.iterrows() + if x['TDCcount'] > 3 and 20 <= x['pressure'] < 50] +pressure = [x['pressure'] for x in xs ] +print("Last record read:") +print(repr(xs[-1])) +print("Field pressure elements satisfying the cuts:") +print(repr(pressure)) + +# Read also the names with the same cuts +names = [ + x['name'] for x in table.where( + """(TDCcount > 3) & (20 <= pressure) & (pressure < 50)""") +] +print("Field names elements satisfying the cuts:") +print(repr(names)) + +print() +print('-**-**-**-**-**-**- array object creation -**-**-**-**-**-**-**-') + +print("Creating a new group called '/columns' to hold new arrays") +gcolumns = h5file.create_group(h5file.root, "columns", "Pressure and Name") + +print("Creating an array called 'pressure' under '/columns' group") +h5file.create_array(gcolumns, 'pressure', np.array(pressure), + "Pressure column selection") +print(repr(h5file.root.columns.pressure)) + +print("Creating another array called 'name' under '/columns' group") +h5file.create_array(gcolumns, 'name', names, "Name column selection") +print(repr(h5file.root.columns.name)) + +print("HDF5 file:") +print(h5file) + +# Close the file +h5file.close() +print("File '" + filename + "' created") diff --git a/examples/tutorial1-2.py b/examples/tutorial1-2.py new file mode 100644 index 0000000..467eb8c --- /dev/null +++ b/examples/tutorial1-2.py @@ -0,0 +1,278 @@ +"""This example shows how to browse the object tree and enlarge tables. 
+ +Before to run this program you need to execute first tutorial1-1.py +that create the tutorial1.h5 file needed here. + +""" + +import tables as tb + +print() +print('-**-**-**-**- open the previous tutorial file -**-**-**-**-**-') + +# Reopen the file in append mode +h5file = tb.open_file("tutorial1.h5", "a") + +# Print the object tree created from this filename +print("Object tree from filename:", h5file.filename) +print(h5file) + +print() +print('-**-**-**-**-**-**- traverse tree methods -**-**-**-**-**-**-**-') + +# List all the nodes (Group and Leaf objects) on tree +print(h5file) + +# List all the nodes (using File iterator) on tree +print("Nodes in file:") +for node in h5file: + print(node) +print() + +# Now, only list all the groups on tree +print("Groups in file:") +for group in h5file.walk_groups(): + print(group) +print() + +# List only the arrays hanging from / +print("Arrays in file (I):") +for group in h5file.walk_groups("/"): + for array in h5file.list_nodes(group, classname='Array'): + print(array) + +# This do the same result +print("Arrays in file (II):") +for array in h5file.walk_nodes("/", "Array"): + print(array) +print() +# And finally, list only leafs on /detector group (there should be one!) +print("Leafs in group '/detector' (I):") +for leaf in h5file.list_nodes("/detector", 'Leaf'): + print(leaf) + +# Other way using iterators and natural naming +print("Leafs in group '/detector' (II):") +for leaf in h5file.root.detector._f_walknodes('Leaf'): + print(leaf) + + +print() +print('-**-**-**-**-**-**- setting/getting object attributes -**-**--**-**-') + +# Get a pointer to '/detector/readout' node +table = h5file.root.detector.readout +# Attach it a string (date) attribute +table.attrs.gath_date = "Wed, 06/12/2003 18:33" +# Attach a floating point attribute +table.attrs.temperature = 18.4 +table.attrs.temp_scale = "Celsius" + +# Get a pointer to '/detector' node +detector = h5file.root.detector +# Attach a general object to the parent (/detector) group +detector._v_attrs.stuff = [5, (2.3, 4.5), "Integer and tuple"] + +# Now, get the attributes +print("gath_date attribute of /detector/readout:", table.attrs.gath_date) +print("temperature attribute of /detector/readout:", table.attrs.temperature) +print("temp_scale attribute of /detector/readout:", table.attrs.temp_scale) +print("stuff attribute in /detector:", detector._v_attrs.stuff) +print() + +# Delete permanently the attribute gath_date of /detector/readout +print("Deleting /detector/readout gath_date attribute") +del table.attrs.gath_date + +# Print a representation of all attributes in /detector/table +print("AttributeSet instance in /detector/table:", repr(table.attrs)) + +# Get the (user) attributes of /detector/table +print("List of user attributes in /detector/table:", table.attrs._f_list()) + +# Get the (sys) attributes of /detector/table +print("List of user attributes in /detector/table:", + table.attrs._f_list("sys")) +print() +# Rename an attribute +print("renaming 'temp_scale' attribute to 'tempScale'") +table.attrs._f_rename("temp_scale", "tempScale") +print(table.attrs._f_list()) + +# Try to rename a system attribute: +try: + table.attrs._f_rename("VERSION", "version") +except: + print("You can not rename a VERSION attribute: it is read only!.") + +print() +print('-**-**-**-**-**-**- getting object metadata -**-**-**-**-**-**-') + +# Get a pointer to '/detector/readout' data +table = h5file.root.detector.readout + +# Get metadata from table +print("Object:", table) +print("Table name:", table.name) 
+print("Table title:", table.title) +print("Number of rows in table:", table.nrows) +print("Table variable names with their type and shape:") +for name in table.colnames: + print(name, ':= {}, {}'.format(table.coldtypes[name], + table.coldtypes[name].shape)) +print() + +# Get the object in "/columns pressure" +pressureObject = h5file.get_node("/columns", "pressure") + +# Get some metadata on this object +print("Info on the object:", repr(pressureObject)) +print(" shape: ==>", pressureObject.shape) +print(" title: ==>", pressureObject.title) +print(" atom: ==>", pressureObject.atom) +print() +print('-**-**-**-**-**- reading actual data from arrays -**-**-**-**-**-**-') + +# Read the 'pressure' actual data +pressureArray = pressureObject.read() +print(repr(pressureArray)) +# Check the kind of object we have created (it should be a numpy array) +print("pressureArray is an object of type:", type(pressureArray)) + +# Read the 'name' Array actual data +nameArray = h5file.root.columns.name.read() +# Check the kind of object we have created (it should be a numpy array) +print("nameArray is an object of type:", type(nameArray)) + +print() + +# Print the data for both arrays +print("Data on arrays nameArray and pressureArray:") +for i in range(pressureObject.shape[0]): + print(nameArray[i], "-->", pressureArray[i]) + +print() +print('-**-**-**-**-**- reading actual data from tables -**-**-**-**-**-**-') + +# Create a shortcut to table object +table = h5file.root.detector.readout + +# Read the 'energy' column of '/detector/readout' +print("Column 'energy' of '/detector/readout':\n", table.cols.energy) +print() +# Read the 3rd row of '/detector/readout' +print("Third row of '/detector/readout':\n", table[2]) +print() +# Read the rows from 3 to 9 of row of '/detector/readout' +print("Rows from 3 to 9 of '/detector/readout':\n", table[2:9]) + +print() +print('-**-**-**-**- append records to existing table -**-**-**-**-**-') + +# Get the object row from table +table = h5file.root.detector.readout +particle = table.row + +# Append 5 new particles to table +for i in range(10, 15): + particle['name'] = 'Particle: %6d' % (i) + particle['TDCcount'] = i % 256 + particle['ADCcount'] = (i * 256) % (1 << 16) + particle['grid_i'] = i + particle['grid_j'] = 10 - i + particle['pressure'] = float(i * i) + particle['energy'] = float(particle['pressure'] ** 4) + particle['idnumber'] = i * (2 ** 34) # This exceeds long integer range + particle.append() + +# Flush this table +table.flush() + +# Print the data using the table iterator: +for r in table: + print("%-16s | %11.1f | %11.4g | %6d | %6d | %8d |" % + (r['name'], r['pressure'], r['energy'], r['grid_i'], r['grid_j'], + r['TDCcount'])) + +print() +print("Total number of entries in resulting table:", table.nrows) + +print() +print('-**-**-**-**- modify records of a table -**-**-**-**-**-') + +# Single cells +print("First row of readout table.") +print("Before modif-->", table[0]) +table.cols.TDCcount[0] = 1 +print("After modifying first row of TDCcount-->", table[0]) +table.cols.energy[0] = 2 +print("After modifying first row of energy-->", table[0]) + +# Column slices +table.cols.TDCcount[2:5] = [2, 3, 4] +print("After modifying slice [2:5] of ADCcount-->", table[0:5]) +table.cols.energy[1:9:3] = [2, 3, 4] +print("After modifying slice [1:9:3] of energy-->", table[0:9]) + +# Modifying complete Rows +table.modify_rows(start=1, step=3, + rows=[(1, 2, 3.0, 4, 5, 6, 'Particle: None', 8.0), + (2, 4, 6.0, 8, 10, 12, 'Particle: None*2', 16.0)]) +print("After modifying the 
complete third row-->", table[0:5]) + +# Modifying columns inside table iterators +for row in table.where('TDCcount <= 2'): + row['energy'] = row['TDCcount'] * 2 + row.update() +print("After modifying energy column (where TDCcount <=2)-->", table[0:4]) + +print() +print('-**-**-**-**- modify elements of an array -**-**-**-**-**-') + +print("pressure array") +pressureObject = h5file.root.columns.pressure +print("Before modif-->", pressureObject[:]) +pressureObject[0] = 2 +print("First modif-->", pressureObject[:]) +pressureObject[1:3] = [2.1, 3.5] +print("Second modif-->", pressureObject[:]) +pressureObject[::2] = [1, 2] +print("Third modif-->", pressureObject[:]) + +print("name array") +nameObject = h5file.root.columns.name +print("Before modif-->", nameObject[:]) +nameObject[0] = ['Particle: None'] +print("First modif-->", nameObject[:]) +nameObject[1:3] = ['Particle: 0', 'Particle: 1'] +print("Second modif-->", nameObject[:]) +nameObject[::2] = ['Particle: -3', 'Particle: -5'] +print("Third modif-->", nameObject[:]) + +print() +print('-**-**-**-**- remove records from a table -**-**-**-**-**-') + +# Delete some rows on the Table (yes, rows can be removed!) +table.remove_rows(5, 10) + +# Print some table columns, for comparison with array data +print("Some columns in final table:") +print() +# Print the headers +print("%-16s | %11s | %11s | %6s | %6s | %8s |" % + ('name', 'pressure', 'energy', 'grid_i', 'grid_j', + 'TDCcount')) + +print("%-16s + %11s + %11s + %6s + %6s + %8s +" % + ('-' * 16, '-' * 11, '-' * 11, '-' * 6, '-' * 6, '-' * 8)) +# Print the data using the table iterator: +for r in table.iterrows(): + print("%-16s | %11.1f | %11.4g | %6d | %6d | %8d |" % + (r['name'], r['pressure'], r['energy'], r['grid_i'], r['grid_j'], + r['TDCcount'])) + +print() +print("Total number of entries in final table:", table.nrows) + +# Close the file +h5file.close() diff --git a/examples/tutorial2.py b/examples/tutorial2.py new file mode 100644 index 0000000..c05b80a --- /dev/null +++ b/examples/tutorial2.py @@ -0,0 +1,105 @@ +"""This program shows the different protections that PyTables offer to the user +in order to insure a correct data injection in tables. + +Example to be used in the second tutorial in the User's Guide. 
+ +""" + +import tables as tb +import numpy as np + +# Describe a particle record + + +class Particle(tb.IsDescription): + name = tb.StringCol(itemsize=16) # 16-character string + lati = tb.Int32Col() # integer + longi = tb.Int32Col() # integer + pressure = tb.Float32Col(shape=(2, 3)) # array of floats + # (single-precision) + temperature = tb.Float64Col(shape=(2, 3)) # array of doubles + # (double-precision) + +# Native NumPy dtype instances are also accepted +Event = np.dtype([ + ("name", "S16"), + ("TDCcount", np.uint8), + ("ADCcount", np.uint16), + ("xcoord", np.float32), + ("ycoord", np.float32) +]) + +# And dictionaries too (this defines the same structure as above) +# Event = { +# "name" : StringCol(itemsize=16), +# "TDCcount" : UInt8Col(), +# "ADCcount" : UInt16Col(), +# "xcoord" : Float32Col(), +# "ycoord" : Float32Col(), +# } + +# Open a file in "w"rite mode +fileh = tb.open_file("tutorial2.h5", mode="w") +# Get the HDF5 root group +root = fileh.root +# Create the groups: +for groupname in ("Particles", "Events"): + group = fileh.create_group(root, groupname) +# Now, create and fill the tables in Particles group +gparticles = root.Particles +# Create 3 new tables +for tablename in ("TParticle1", "TParticle2", "TParticle3"): + # Create a table + table = fileh.create_table("/Particles", tablename, Particle, + "Particles: " + tablename) + # Get the record object associated with the table: + particle = table.row + # Fill the table with 257 particles + for i in range(257): + # First, assign the values to the Particle record + particle['name'] = 'Particle: %6d' % (i) + particle['lati'] = i + particle['longi'] = 10 - i + # Detectable errors start here. Play with them! + particle['pressure'] = i * np.arange(2 * 4).reshape(2, 4) # Incorrect + # particle['pressure'] = i * arange(2 * 3).reshape(2, 3) # Correct + # End of errors + particle['temperature'] = (i ** 2) # Broadcasting + # This injects the Record values + particle.append() + # Flush the table buffers + table.flush() + +# Now, go for Events: +for tablename in ("TEvent1", "TEvent2", "TEvent3"): + # Create a table in Events group + table = fileh.create_table(root.Events, tablename, Event, + "Events: " + tablename) + # Get the record object associated with the table: + event = table.row + # Fill the table with 257 events + for i in range(257): + # First, assign the values to the Event record + event['name'] = 'Event: %6d' % (i) + event['TDCcount'] = i % (1 << 8) # Correct range + # Detectable errors start here. Play with them! + event['xcoor'] = float(i ** 2) # Wrong spelling + # event['xcoord'] = float(i**2) # Correct spelling + event['ADCcount'] = "sss" # Wrong type + # event['ADCcount'] = i * 2 # Correct type + # End of errors + event['ycoord'] = float(i) ** 4 + # This injects the Record values + event.append() + # Flush the buffers + table.flush() + +# Read the records from table "/Events/TEvent3" and select some +table = root.Events.TEvent3 +e = [p['TDCcount'] for p in table + if p['ADCcount'] < 20 and 4 <= p['TDCcount'] < 15] +print("Last record ==>", p) +print("Selected values ==>", e) +print("Total selected records ==> ", len(e)) +# Finally, close the file (this also will flush all the remaining buffers!) 
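+# (Side note) with the intentionally wrong lines above left active, the
+# script raises an exception well before this point: the (2, 4) array cannot
+# be stored in the (2, 3) 'pressure' column, and the 'xcoor' / string-valued
+# 'ADCcount' assignments would fail likewise if reached.  Swap in the lines
+# marked "Correct" to get a clean run down to this close().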
+fileh.close() diff --git a/examples/tutorial3-1.py b/examples/tutorial3-1.py new file mode 100644 index 0000000..daac374 --- /dev/null +++ b/examples/tutorial3-1.py @@ -0,0 +1,49 @@ +"""Small example of do/undo capability with PyTables.""" + +import tables as tb + +# Create an HDF5 file +fileh = tb.open_file("tutorial3-1.h5", "w", title="Undo/Redo demo 1") + + #'-**-**-**-**-**-**- enable undo/redo log -**-**-**-**-**-**-**-' +fileh.enable_undo() + +# Create a new array +one = fileh.create_array('/', 'anarray', [3, 4], "An array") +# Mark this point +fileh.mark() +# Create a new array +another = fileh.create_array('/', 'anotherarray', [4, 5], "Another array") +# Now undo the past operation +fileh.undo() +# Check that anotherarray does not exist in the object tree but anarray does +assert "/anarray" in fileh +assert "/anotherarray" not in fileh +# Unwind once more +fileh.undo() +# Check that anarray does not exist in the object tree +assert "/anarray" not in fileh +assert "/anotherarray" not in fileh +# Go forward up to the next marker +fileh.redo() +# Check that anarray has come back to life in a sane state +assert "/anarray" in fileh +assert fileh.root.anarray.read() == [3, 4] +assert fileh.root.anarray.title == "An array" +assert fileh.root.anarray == one +# But anotherarray is not here yet +assert "/anotherarray" not in fileh +# Now, go rewind up to the end +fileh.redo() +assert "/anarray" in fileh +# Check that anotherarray has come back to life in a sane state +assert "/anotherarray" in fileh +assert fileh.root.anotherarray.read() == [4, 5] +assert fileh.root.anotherarray.title == "Another array" +assert fileh.root.anotherarray == another + + #'-**-**-**-**-**-**- disable undo/redo log -**-**-**-**-**-**-**-' +fileh.disable_undo() + +# Close the file +fileh.close() diff --git a/examples/tutorial3-2.py b/examples/tutorial3-2.py new file mode 100644 index 0000000..c549e8c --- /dev/null +++ b/examples/tutorial3-2.py @@ -0,0 +1,79 @@ +"""A more complex example of do/undo capability with PyTables. + +Here, names has been assigned to the marks, and jumps are done between +marks. 
+ +""" + +import tables as tb + +# Create an HDF5 file +fileh = tb.open_file('tutorial3-2.h5', 'w', title='Undo/Redo demo 2') + + #'-**-**-**-**-**-**- enable undo/redo log -**-**-**-**-**-**-**-' +fileh.enable_undo() + +# Start undoable operations +fileh.create_array('/', 'otherarray1', [3, 4], 'Another array 1') +fileh.create_group('/', 'agroup', 'Group 1') +# Create a 'first' mark +fileh.mark('first') +fileh.create_array('/agroup', 'otherarray2', [4, 5], 'Another array 2') +fileh.create_group('/agroup', 'agroup2', 'Group 2') +# Create a 'second' mark +fileh.mark('second') +fileh.create_array('/agroup/agroup2', 'otherarray3', [5, 6], 'Another array 3') +# Create a 'third' mark +fileh.mark('third') +fileh.create_array('/', 'otherarray4', [6, 7], 'Another array 4') +fileh.create_array('/agroup', 'otherarray5', [7, 8], 'Another array 5') + +# Now go to mark 'first' +fileh.goto('first') +assert '/otherarray1' in fileh +assert '/agroup' in fileh +assert '/agroup/agroup2' not in fileh +assert '/agroup/otherarray2' not in fileh +assert '/agroup/agroup2/otherarray3' not in fileh +assert '/otherarray4' not in fileh +assert '/agroup/otherarray5' not in fileh +# Go to mark 'third' +fileh.goto('third') +assert '/otherarray1' in fileh +assert '/agroup' in fileh +assert '/agroup/agroup2' in fileh +assert '/agroup/otherarray2' in fileh +assert '/agroup/agroup2/otherarray3' in fileh +assert '/otherarray4' not in fileh +assert '/agroup/otherarray5' not in fileh +# Now go to mark 'second' +fileh.goto('second') +assert '/otherarray1' in fileh +assert '/agroup' in fileh +assert '/agroup/agroup2' in fileh +assert '/agroup/otherarray2' in fileh +assert '/agroup/agroup2/otherarray3' not in fileh +assert '/otherarray4' not in fileh +assert '/agroup/otherarray5' not in fileh +# Go to the end +fileh.goto(-1) +assert '/otherarray1' in fileh +assert '/agroup' in fileh +assert '/agroup/agroup2' in fileh +assert '/agroup/otherarray2' in fileh +assert '/agroup/agroup2/otherarray3' in fileh +assert '/otherarray4' in fileh +assert '/agroup/otherarray5' in fileh +# Check that objects have come back to life in a sane state +assert fileh.root.otherarray1.read() == [3, 4] +assert fileh.root.agroup.otherarray2.read() == [4, 5] +assert fileh.root.agroup.agroup2.otherarray3.read() == [5, 6] +assert fileh.root.otherarray4.read() == [6, 7] +assert fileh.root.agroup.otherarray5.read() == [7, 8] + + + #'-**-**-**-**-**-**- disable undo/redo log -**-**-**-**-**-**-**-' +fileh.disable_undo() + +# Close the file +fileh.close() diff --git a/examples/undo-redo.py b/examples/undo-redo.py new file mode 100644 index 0000000..5e55441 --- /dev/null +++ b/examples/undo-redo.py @@ -0,0 +1,142 @@ +"""Yet another couple of examples on do/undo feauture.""" + +import tables as tb + + +def setUp(filename): + # Create an HDF5 file + fileh = tb.open_file(filename, mode="w", title="Undo/Redo demo") + # Create some nodes in there + fileh.create_group("/", "agroup", "Group 1") + fileh.create_group("/agroup", "agroup2", "Group 2") + fileh.create_array("/", "anarray", [1, 2], "Array 1") + # Enable undo/redo. + fileh.enable_undo() + return fileh + + +def tearDown(fileh): + # Disable undo/redo. 
+ fileh.disable_undo() + # Close the file + fileh.close() + + +def demo_6times3marks(): + """Checking with six ops and three marks.""" + + # Initialize the data base with some nodes + fileh = setUp("undo-redo-6times3marks.h5") + + # Create a new array + fileh.create_array('/', 'otherarray1', [3, 4], "Another array 1") + fileh.create_array('/', 'otherarray2', [4, 5], "Another array 2") + # Put a mark + fileh.mark() + fileh.create_array('/', 'otherarray3', [5, 6], "Another array 3") + fileh.create_array('/', 'otherarray4', [6, 7], "Another array 4") + # Put a mark + fileh.mark() + fileh.create_array('/', 'otherarray5', [7, 8], "Another array 5") + fileh.create_array('/', 'otherarray6', [8, 9], "Another array 6") + # Unwind just one mark + fileh.undo() + assert "/otherarray1" in fileh + assert "/otherarray2" in fileh + assert "/otherarray3" in fileh + assert "/otherarray4" in fileh + assert "/otherarray5" not in fileh + assert "/otherarray6" not in fileh + # Unwind another mark + fileh.undo() + assert "/otherarray1" in fileh + assert "/otherarray2" in fileh + assert "/otherarray3" not in fileh + assert "/otherarray4" not in fileh + assert "/otherarray5" not in fileh + assert "/otherarray6" not in fileh + # Unwind all marks + fileh.undo() + assert "/otherarray1" not in fileh + assert "/otherarray2" not in fileh + assert "/otherarray3" not in fileh + assert "/otherarray4" not in fileh + assert "/otherarray5" not in fileh + assert "/otherarray6" not in fileh + # Redo until the next mark + fileh.redo() + assert "/otherarray1" in fileh + assert "/otherarray2" in fileh + assert "/otherarray3" not in fileh + assert "/otherarray4" not in fileh + assert "/otherarray5" not in fileh + assert "/otherarray6" not in fileh + # Redo until the next mark + fileh.redo() + assert "/otherarray1" in fileh + assert "/otherarray2" in fileh + assert "/otherarray3" in fileh + assert "/otherarray4" in fileh + assert "/otherarray5" not in fileh + assert "/otherarray6" not in fileh + # Redo until the end + fileh.redo() + assert "/otherarray1" in fileh + assert "/otherarray2" in fileh + assert "/otherarray3" in fileh + assert "/otherarray4" in fileh + assert "/otherarray5" in fileh + assert "/otherarray6" in fileh + + # Tear down the file + tearDown(fileh) + + +def demo_manyops(): + """Checking many operations together.""" + + # Initialize the data base with some nodes + fileh = setUp("undo-redo-manyops.h5") + + # Create an array + fileh.create_array(fileh.root, 'anarray3', [3], "Array title 3") + # Create a group + fileh.create_group(fileh.root, 'agroup3', "Group title 3") + # /anarray => /agroup/agroup3/ + new_node = fileh.copy_node('/anarray3', '/agroup/agroup2') + new_node = fileh.copy_children('/agroup', '/agroup3', recursive=1) + # rename anarray + fileh.rename_node('/anarray', 'anarray4') + # Move anarray + new_node = fileh.copy_node('/anarray3', '/agroup') + # Remove anarray4 + fileh.remove_node('/anarray4') + # Undo the actions + fileh.undo() + assert '/anarray4' not in fileh + assert '/anarray3' not in fileh + assert '/agroup/agroup2/anarray3' not in fileh + assert '/agroup3' not in fileh + assert '/anarray4' not in fileh + assert '/anarray' in fileh + + # Redo the actions + fileh.redo() + # Check that the copied node exists again in the object tree. 
+ assert '/agroup/agroup2/anarray3' in fileh + assert '/agroup/anarray3' in fileh + assert '/agroup3/agroup2/anarray3' in fileh + assert '/agroup3/anarray3' not in fileh + assert fileh.root.agroup.anarray3 is new_node + assert '/anarray' not in fileh + assert '/anarray4' not in fileh + + # Tear down the file + tearDown(fileh) + + +if __name__ == '__main__': + + # run demos + demo_6times3marks() + demo_manyops() diff --git a/examples/vlarray1.py b/examples/vlarray1.py new file mode 100644 index 0000000..ea59b15 --- /dev/null +++ b/examples/vlarray1.py @@ -0,0 +1,37 @@ +import tables as tb +import numpy as np + +# Create a VLArray: +fileh = tb.open_file('vlarray1.h5', mode='w') +vlarray = fileh.create_vlarray(fileh.root, 'vlarray1', + tb.Int32Atom(shape=()), + "ragged array of ints", + filters=tb.Filters(1)) +# Append some (variable length) rows: +vlarray.append(np.array([5, 6])) +vlarray.append(np.array([5, 6, 7])) +vlarray.append([5, 6, 9, 8]) + +# Now, read it through an iterator: +print('-->', vlarray.title) +for x in vlarray: + print('%s[%d]--> %s' % (vlarray.name, vlarray.nrow, x)) + +# Now, do the same with native Python strings. +vlarray2 = fileh.create_vlarray(fileh.root, 'vlarray2', + tb.StringAtom(itemsize=2), + "ragged array of strings", + filters=tb.Filters(1)) +vlarray2.flavor = 'python' +# Append some (variable length) rows: +print('-->', vlarray2.title) +vlarray2.append(['5', '66']) +vlarray2.append(['5', '6', '77']) +vlarray2.append(['5', '6', '9', '88']) + +# Now, read it through an iterator: +for x in vlarray2: + print('%s[%d]--> %s' % (vlarray2.name, vlarray2.nrow, x)) + +# Close the file. +fileh.close() diff --git a/examples/vlarray2.py b/examples/vlarray2.py new file mode 100644 index 0000000..4ddca9d --- /dev/null +++ b/examples/vlarray2.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +"""Small example that shows how to work with variable length arrays of +different types, UNICODE strings and general Python objects included.""" + +import pickle +import numpy as np +import tables as tb + +# Open a new empty HDF5 file +fileh = tb.open_file("vlarray2.h5", mode="w") +# Get the root group +root = fileh.root + +# A test with VL length arrays: +vlarray = fileh.create_vlarray(root, 'vlarray1', tb.Int32Atom(), + "ragged array of ints") +vlarray.append(np.array([5, 6])) +vlarray.append(np.array([5, 6, 7])) +vlarray.append([5, 6, 9, 8]) + +# Test with lists of bidimensional vectors +vlarray = fileh.create_vlarray(root, 'vlarray2', tb.Int64Atom(shape=(2,)), + "Ragged array of vectors") +a = np.array([[1, 2], [1, 2]], dtype=np.int64) +vlarray.append(a) +vlarray.append(np.array([[1, 2], [3, 4]], dtype=np.int64)) +vlarray.append(np.zeros(dtype=np.int64, shape=(0, 2))) +vlarray.append(np.array([[5, 6]], dtype=np.int64)) +# This makes an error (shape) +# vlarray.append(array([[5], [6]], dtype=int64)) +# This makes an error (type) +# vlarray.append(array([[5, 6]], dtype=uint64)) + +# Test with strings +vlarray = fileh.create_vlarray(root, 'vlarray3', tb.StringAtom(itemsize=3), + "Ragged array of strings") +vlarray.append(["123", "456", "3"]) +vlarray.append(["456", "3"]) +# This makes an error because of different string sizes than declared +# vlarray.append(["1234", "456", "3"]) + +# Python flavor +vlarray = fileh.create_vlarray(root, 'vlarray3b', + tb.StringAtom(itemsize=3), + "Ragged array of strings") +vlarray.flavor = "python" +vlarray.append(["123", "456", "3"]) +vlarray.append(["456", "3"]) + +# Binary strings +vlarray = fileh.create_vlarray(root, 'vlarray4', tb.UInt8Atom(), + 
"pickled bytes") +data = pickle.dumps((["123", "456"], "3")) +vlarray.append(np.ndarray(buffer=data, dtype=np.uint8, shape=len(data))) + +# The next is a way of doing the same than before +vlarray = fileh.create_vlarray(root, 'vlarray5', tb.ObjectAtom(), + "pickled object") +vlarray.append([["123", "456"], "3"]) + +# Boolean arrays are supported as well +vlarray = fileh.create_vlarray(root, 'vlarray6', tb.BoolAtom(), + "Boolean atoms") +# The next lines are equivalent... +vlarray.append([1, 0]) +vlarray.append([1, 0, 3, 0]) # This will be converted to a boolean +# This gives a TypeError +# vlarray.append([1,0,1]) + +# Variable length strings +vlarray = fileh.create_vlarray(root, 'vlarray7', tb.VLStringAtom(), + "Variable Length String") +vlarray.append("asd") +vlarray.append("aaana") + +# Unicode variable length strings +vlarray = fileh.create_vlarray(root, 'vlarray8', tb.VLUnicodeAtom(), + "Variable Length Unicode String") +vlarray.append("aaana") +vlarray.append("") # The empty string +vlarray.append("asd") +vlarray.append("para\u0140lel") + +# Close the file +fileh.close() + +# Open the file for reading +fileh = tb.open_file("vlarray2.h5", mode="r") +# Get the root group +root = fileh.root + +for object in fileh.list_nodes(root, "Leaf"): + arr = object.read() + print(object.name, "-->", arr) + print("number of objects in this row:", len(arr)) + +# Close the file +fileh.close() diff --git a/examples/vlarray3.py b/examples/vlarray3.py new file mode 100644 index 0000000..cd12155 --- /dev/null +++ b/examples/vlarray3.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +"""Example that shows how to easily save a variable number of atoms with a +VLArray.""" + +import numpy as np +import tables as tb + +N = 100 +shape = (3, 3) + +np.random.seed(10) # For reproductible results +f = tb.open_file("vlarray3.h5", mode="w") +vlarray = f.create_vlarray(f.root, 'vlarray1', + tb.Float64Atom(shape=shape), + "ragged array of arrays") + +k = 0 +for i in range(N): + l = [] + for j in range(np.random.randint(N)): + l.append(np.random.randn(*shape)) + k += 1 + vlarray.append(l) + +print("Total number of atoms:", k) +f.close() diff --git a/examples/vlarray4.py b/examples/vlarray4.py new file mode 100644 index 0000000..f8099da --- /dev/null +++ b/examples/vlarray4.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +"""Example that shows how to easily save a variable number of atoms with a +VLArray.""" + +import numpy as np +import tables as tb + +N = 100 +shape = (3, 3) + +np.random.seed(10) # For reproductible results +f = tb.open_file("vlarray4.h5", mode="w") +vlarray = f.create_vlarray(f.root, 'vlarray1', + tb.Float64Atom(shape=shape), + "ragged array of arrays") + +k = 0 +for i in range(N): + l = [] + for j in range(np.random.randint(N)): + l.append(np.random.randn(*shape)) + k += 1 + vlarray.append(l) + +print("Total number of atoms:", k) +f.close() diff --git a/hdf5-blosc/.gitignore b/hdf5-blosc/.gitignore new file mode 100644 index 0000000..31d3114 --- /dev/null +++ b/hdf5-blosc/.gitignore @@ -0,0 +1,36 @@ +# Object files +*.o +*.ko +*.obj +*.elf + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ + +# Anything in the 'build' folder. 
+build/ + diff --git a/hdf5-blosc/.travis.yml b/hdf5-blosc/.travis.yml new file mode 100644 index 0000000..95ad44b --- /dev/null +++ b/hdf5-blosc/.travis.yml @@ -0,0 +1,22 @@ +language: c + +os: +- linux +- osx + +compiler: + - gcc + - clang + +before_install: ./travis-before-install.sh + +install: sudo apt-get install libhdf5-serial-dev + +before_script: + - mkdir build + - cd build + - cmake .. + +script: + - cmake --build . --config Release + - ctest diff --git a/hdf5-blosc/CMakeLists.txt b/hdf5-blosc/CMakeLists.txt new file mode 100644 index 0000000..8644a04 --- /dev/null +++ b/hdf5-blosc/CMakeLists.txt @@ -0,0 +1,71 @@ +cmake_minimum_required(VERSION 2.8.10) +project(blosc_hdf5) +include(ExternalProject) + +# options +option(BUILD_TESTS + "Build test programs form the blosc filter" ON) + +set(BLOSC_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/blosc") +set(BLOSC_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/blosc") +set(BLOSC_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BLOSC_INSTALL_DIR}) + +message("BLOSC_PREFIX='${BLOSC_PREFIX}'") +message("BLOSC_INSTALL_DIR='${BLOSC_INSTALL_DIR}'") +message("BLOSC_CMAKE_ARGS='${BLOSC_CMAKE_ARGS}'") +message("GIT_EXECUTABLE='${GIT_EXECUTABLE}'") + +ExternalProject_Add(blosc + PREFIX ${BLOSC_PREFIX} + GIT_REPOSITORY https://github.com/Blosc/c-blosc.git + INSTALL_DIR ${BLOSC_INSTALL_DIR} + CMAKE_ARGS ${BLOSC_CMAKE_ARGS} +) + + +# sources +set(SOURCES src/blosc_filter.c) + +# dependencies +if(MSVC) + # FindHDF5.cmake does not find Windows installations. Try to + # use an environment variable instead until the official "find" + # file can be updated for Windows. + # + # Note that you have to set this environment variable by hand. + file(TO_CMAKE_PATH "$ENV{HDF5_DIR}" HDF5_HINT) + set(HDF5_DIR ${HDF5_HINT} CACHE STRING "Path to HDF5 CMake config directory.") + find_package(HDF5 REQUIRED HINTS ${HDF5_DIR}) +else(MSVC) + find_package(HDF5 REQUIRED) +endif(MSVC) +include_directories(${HDF5_INCLUDE_DIRS}) + + +# add blosc libraries +add_library(blosc_shared SHARED IMPORTED) +set_property(TARGET blosc_shared PROPERTY IMPORTED_LOCATION ${BLOSC_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}blosc${CMAKE_SHARED_LIBRARY_SUFFIX}) +add_dependencies(blosc_shared project_blosc) +include_directories(${BLOSC_INSTALL_DIR}/include) + +add_library(blosc_filter_shared SHARED ${SOURCES}) +set_target_properties( + blosc_filter_shared PROPERTIES OUTPUT_NAME blosc_filter) +target_link_libraries(blosc_filter_shared blosc_shared ${HDF5_LIBRARIES}) + +# install +install(FILES src/blosc_filter.h DESTINATION include COMPONENT HDF5_FILTER_DEV) +install(TARGETS blosc_filter_shared DESTINATION lib COMPONENT HDF5_FILTER_DEV) + + +# test +message("LINK LIBRARIES='blosc_filter_shared ${HDF5_LIBRARIES}'") +if(BUILD_TESTS) + enable_testing() + set(CMAKE_THREAD_PREFER_PTHREAD TRUE) + find_package(Threads REQUIRED) + set(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) + add_executable(example src/example.c) + target_link_libraries(example blosc_filter_shared ${HDF5_LIBRARIES} ${LIBS}) + add_test(test_hdf5_filter example) +endif(BUILD_TESTS) diff --git a/hdf5-blosc/LICENSES/BLOSC.txt b/hdf5-blosc/LICENSES/BLOSC.txt new file mode 100644 index 0000000..55b1392 --- /dev/null +++ b/hdf5-blosc/LICENSES/BLOSC.txt @@ -0,0 +1,21 @@ +Blosc - A blocking, shuffling and lossless compression library + +Copyright (C) 2009-2015 Francesc Alted + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, 
including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/hdf5-blosc/LICENSES/BLOSC_HDF5.txt b/hdf5-blosc/LICENSES/BLOSC_HDF5.txt new file mode 100644 index 0000000..9c4c145 --- /dev/null +++ b/hdf5-blosc/LICENSES/BLOSC_HDF5.txt @@ -0,0 +1,21 @@ +Blosc for HDF5 - An HDF5 filter that uses the Blosc compressor. + +Copyright (C) 2009-2015 Francesc Alted + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/hdf5-blosc/LICENSES/H5PY.txt b/hdf5-blosc/LICENSES/H5PY.txt new file mode 100644 index 0000000..15b30f2 --- /dev/null +++ b/hdf5-blosc/LICENSES/H5PY.txt @@ -0,0 +1,34 @@ +Copyright Notice and Statement for the h5py Project + +Copyright (c) 2008 Andrew Collette +http://h5py.alfven.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +a. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +c. Neither the name of the author nor the names of contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/hdf5-blosc/README.rst b/hdf5-blosc/README.rst new file mode 100644 index 0000000..147c99d --- /dev/null +++ b/hdf5-blosc/README.rst @@ -0,0 +1,69 @@ +===================== +Blosc filter for HDF5 +===================== + +:Travis CI: |travis| +:And...: |powered| + +.. |travis| image:: https://travis-ci.org/Blosc/hdf5.png?branch=master + :target: https://travis-ci.org/Blosc/hdf5 + +.. |powered| image:: http://b.repl.ca/v1/Powered--By-Blosc-blue.png + :target: https://blosc.org + +This is an example of filter for HDF5 that uses the Blosc compressor. + +You need to be a bit careful before using this filter because you +should not activate the shuffle right in HDF5, but rather from Blosc +itself. This is because Blosc uses an SIMD shuffle internally which +is much faster. + + +Using the Blosc filter from HDF5 +================================ + +In order to register Blosc into your HDF5 application, you only need +to call a function in blosc_filter.h, with the following signature: + + int register_blosc(char **version, char **date) + +Calling this will register the filter with the HDF5 library and will +return info about the Blosc release in `**version` and `**date` +char pointers. + +A non-negative return value indicates success. If the registration +fails, an error is pushed onto the current error stack and a negative +value is returned. + +An example C program ('src/example.c') is included which demonstrates +the proper use of the filter. + +This filter has been tested against HDF5 versions 1.6.5 through +1.8.10. It is released under the MIT license (see LICENSE.txt for +details). + + +Compiling +========= + +The filter consists of a single 'src/blosc_filter.c' source file and +'src/blosc_filter.h' header, which will need the Blosc library +installed to work. + + +As an HDF5 plugin +================= + +Also, you can use blosc as an HDF5 plugin; see 'src/blosc_plugin.c' for +details. + + +Acknowledgments +=============== + +See THANKS.rst. + + +---- + + **Enjoy data!** diff --git a/hdf5-blosc/src/blosc_filter.c b/hdf5-blosc/src/blosc_filter.c new file mode 100644 index 0000000..ba3ebed --- /dev/null +++ b/hdf5-blosc/src/blosc_filter.c @@ -0,0 +1,271 @@ +/* + Copyright (C) 2010-2016 Francesc Alted + http://blosc.org + License: MIT (see LICENSE.txt) + + Filter program that allows the use of the Blosc filter in HDF5. + + This is based on the LZF filter interface (http://h5py.alfven.org) + by Andrew Collette. + +*/ + + +#include +#include +#include +#include +#include "hdf5.h" +#include "blosc_filter.h" + +#if defined(__GNUC__) +#define PUSH_ERR(func, minor, str, ...) H5Epush(H5E_DEFAULT, __FILE__, func, __LINE__, H5E_ERR_CLS, H5E_PLINE, minor, str, ##__VA_ARGS__) +#elif defined(_MSC_VER) +#define PUSH_ERR(func, minor, str, ...) H5Epush(H5E_DEFAULT, __FILE__, func, __LINE__, H5E_ERR_CLS, H5E_PLINE, minor, str, __VA_ARGS__) +#else +/* This version is portable but it's better to use compiler-supported + approaches for handling the trailing comma issue when possible. 
*/ +#define PUSH_ERR(func, minor, ...) H5Epush(H5E_DEFAULT, __FILE__, func, __LINE__, H5E_ERR_CLS, H5E_PLINE, minor, __VA_ARGS__) +#endif /* defined(__GNUC__) */ + +#define GET_FILTER(a,b,c,d,e,f,g) H5Pget_filter_by_id(a,b,c,d,e,f,g,NULL) + + +size_t blosc_filter(unsigned flags, size_t cd_nelmts, + const unsigned cd_values[], size_t nbytes, + size_t *buf_size, void **buf); + +herr_t blosc_set_local(hid_t dcpl, hid_t type, hid_t space); + + +/* Register the filter, passing on the HDF5 return value */ +int register_blosc(char **version, char **date){ + + int retval; + + H5Z_class_t filter_class = { + H5Z_CLASS_T_VERS, + (H5Z_filter_t)(FILTER_BLOSC), + 1, 1, + "blosc", + NULL, + (H5Z_set_local_func_t)(blosc_set_local), + (H5Z_func_t)(blosc_filter) + }; + + retval = H5Zregister(&filter_class); + if(retval<0){ + PUSH_ERR("register_blosc", H5E_CANTREGISTER, "Can't register Blosc filter"); + } + *version = strdup(BLOSC_VERSION_STRING); + *date = strdup(BLOSC_VERSION_DATE); + return 1; /* lib is available */ +} + +/* Filter setup. Records the following inside the DCPL: + + 1. If version information is not present, set slots 0 and 1 to the filter + revision and Blosc version, respectively. + + 2. Compute the type size in bytes and store it in slot 2. + + 3. Compute the chunk size in bytes and store it in slot 3. +*/ +herr_t blosc_set_local(hid_t dcpl, hid_t type, hid_t space){ + + int ndims; + int i; + herr_t r; + + unsigned int typesize, basetypesize; + unsigned int bufsize; + hsize_t chunkdims[32]; + unsigned int flags; + size_t nelements = 8; + unsigned int values[] = {0,0,0,0,0,0,0,0}; + hid_t super_type; + H5T_class_t classt; + + r = GET_FILTER(dcpl, FILTER_BLOSC, &flags, &nelements, values, 0, NULL); + if(r<0) return -1; + + if(nelements < 4) nelements = 4; /* First 4 slots reserved. */ + + /* Set Blosc info in first two slots */ + values[0] = FILTER_BLOSC_VERSION; + values[1] = BLOSC_VERSION_FORMAT; + + ndims = H5Pget_chunk(dcpl, 32, chunkdims); + if(ndims<0) return -1; + if(ndims>32){ + PUSH_ERR("blosc_set_local", H5E_CALLBACK, "Chunk rank exceeds limit"); + return -1; + } + + typesize = H5Tget_size(type); + if (typesize==0) return -1; + /* Get the size of the base type, even for ARRAY types */ + classt = H5Tget_class(type); + if (classt == H5T_ARRAY) { + /* Get the array base component */ + super_type = H5Tget_super(type); + basetypesize = H5Tget_size(super_type); + /* Release resources */ + H5Tclose(super_type); + } + else { + basetypesize = typesize; + } + + /* Limit large typesizes (they are pretty inneficient to shuffle + and, in addition, Blosc does not handle typesizes larger than + 256 bytes). */ + if (basetypesize > BLOSC_MAX_TYPESIZE) basetypesize = 1; + values[2] = basetypesize; + + /* Get the size of the chunk */ + bufsize = typesize; + for (i=0; i= 5) { + clevel = cd_values[4]; /* The compression level */ + } + if (cd_nelmts >= 6) { + doshuffle = cd_values[5]; /* BLOSC_SHUFFLE, BLOSC_BITSHUFFLE */ + /* bitshuffle is only meant for production in >= 1.8.0 */ +#if ( (BLOSC_VERSION_MAJOR <= 1) && (BLOSC_VERSION_MINOR < 8) ) + if (doshuffle == BLOSC_BITSHUFFLE) { + PUSH_ERR("blosc_filter", H5E_CALLBACK, + "this Blosc library version is not supported. 
Please update to >= 1.8"); + goto failed; + } +#endif + } + if (cd_nelmts >= 7) { + const char *complist; + + compcode = cd_values[6]; /* The Blosc compressor used */ + /* Check that we actually have support for the compressor code */ + complist = blosc_list_compressors(); + code = blosc_compcode_to_compname(compcode, &compname); + if (code == -1) { + PUSH_ERR("blosc_filter", H5E_CALLBACK, + "this Blosc library does not have support for " + "the '%s' compressor, but only for: %s", + compname, complist); + goto failed; + } + } + + /* We're compressing */ + if(!(flags & H5Z_FLAG_REVERSE)){ + + /* Allocate an output buffer exactly as long as the input data; if + the result is larger, we simply return 0. The filter is flagged + as optional, so HDF5 marks the chunk as uncompressed and + proceeds. + */ + + outbuf_size = (*buf_size); + +#ifdef BLOSC_DEBUG + fprintf(stderr, "Blosc: Compress %zd chunk w/buffer %zd\n", + nbytes, outbuf_size); +#endif + + outbuf = malloc(outbuf_size); + + if (outbuf == NULL){ + PUSH_ERR("blosc_filter", H5E_CALLBACK, + "Can't allocate compression buffer"); + goto failed; + } + + blosc_set_compressor(compname); + status = blosc_compress(clevel, doshuffle, typesize, nbytes, + *buf, outbuf, nbytes); + if (status < 0) { + PUSH_ERR("blosc_filter", H5E_CALLBACK, "Blosc compression error"); + goto failed; + } + + /* We're decompressing */ + } else { + /* declare dummy variables */ + size_t cbytes, blocksize; + + free(outbuf); + + /* Extract the exact outbuf_size from the buffer header. + * + * NOTE: the guess value got from "cd_values" corresponds to the + * uncompressed chunk size but it should not be used in a general + * cases since other filters in the pipeline can modify the buffere + * size. + */ + blosc_cbuffer_sizes(*buf, &outbuf_size, &cbytes, &blocksize); + +#ifdef BLOSC_DEBUG + fprintf(stderr, "Blosc: Decompress %zd chunk w/buffer %zd\n", nbytes, outbuf_size); +#endif + + outbuf = malloc(outbuf_size); + + if(outbuf == NULL){ + PUSH_ERR("blosc_filter", H5E_CALLBACK, "Can't allocate decompression buffer"); + goto failed; + } + + status = blosc_decompress(*buf, outbuf, outbuf_size); + if(status <= 0){ /* decompression failed */ + PUSH_ERR("blosc_filter", H5E_CALLBACK, "Blosc decompression error"); + goto failed; + } /* if !status */ + + } /* compressing vs decompressing */ + + if(status != 0){ + free(*buf); + *buf = outbuf; + *buf_size = outbuf_size; + return status; /* Size of compressed/decompressed data */ + } + + failed: + free(outbuf); + return 0; + +} /* End filter function */ diff --git a/hdf5-blosc/src/blosc_filter.h b/hdf5-blosc/src/blosc_filter.h new file mode 100644 index 0000000..f7f0d3a --- /dev/null +++ b/hdf5-blosc/src/blosc_filter.h @@ -0,0 +1,27 @@ +#ifndef FILTER_BLOSC_H +#define FILTER_BLOSC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "blosc.h" + +/* Filter revision number, starting at 1 */ +/* #define FILTER_BLOSC_VERSION 1 */ +#define FILTER_BLOSC_VERSION 2 /* multiple compressors since Blosc 1.3 */ + +/* Filter ID registered with the HDF Group */ +#define FILTER_BLOSC 32001 + +/* Registers the filter with the HDF5 library. 
*/ +#if defined(_MSC_VER) +__declspec(dllexport) +#endif /* defined(_MSC_VER) */ +int register_blosc(char **version, char **date); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/hdf5-blosc/src/blosc_plugin.c b/hdf5-blosc/src/blosc_plugin.c new file mode 100644 index 0000000..99d7912 --- /dev/null +++ b/hdf5-blosc/src/blosc_plugin.c @@ -0,0 +1,42 @@ +/* + * Dynamically loaded filter plugin for HDF5 blosc filter. + * + * Author: Kiyoshi Masui + * Created: 2014 + * + */ + + +#include + + +#define H5Z_class_t_vers 2 + +#include "blosc_plugin.h" +#include "blosc_filter.h" + + +// Prototypes for filter function in blosc_filter.c. +size_t blosc_filter(unsigned flags, size_t cd_nelmts, + const unsigned cd_values[], size_t nbytes, + size_t *buf_size, void **buf); + +herr_t blosc_set_local(hid_t dcpl, hid_t type, hid_t space); + + +H5Z_class_t blosc_H5Filter[1] = {{ + H5Z_CLASS_T_VERS, + (H5Z_filter_t)(FILTER_BLOSC), + 1, 1, + "blosc", + NULL, + (H5Z_set_local_func_t)(blosc_set_local), + (H5Z_func_t)(blosc_filter) +}}; + + +H5PL_type_t H5PLget_plugin_type(void) {return H5PL_TYPE_FILTER;} + + +const void* H5PLget_plugin_info(void) {return blosc_H5Filter;} + diff --git a/hdf5-blosc/src/blosc_plugin.h b/hdf5-blosc/src/blosc_plugin.h new file mode 100644 index 0000000..c2295fc --- /dev/null +++ b/hdf5-blosc/src/blosc_plugin.h @@ -0,0 +1,36 @@ +/* + * Dynamically loaded filter plugin for HDF5 blosc filter. + * + * Author: Kiyoshi Masui + * Created: 2014 + * + * + * Header file + * ----------- + * + * This provides dynamically loaded HDF5 filter functionality (introduced + * in HDF5-1.8.11, May 2013) to the blosc HDF5 filter. + * + * Usage: compile as a shared library and install either to the default + * search location for HDF5 filter plugins (on Linux + * /usr/local/hdf5/lib/plugin) or to a location pointed to by the + * HDF5_PLUGIN_PATH environment variable. + * + */ + + +#ifndef PLUGIN_BLOSC_H +#define PLUGIN_BLOSC_H + +#include "H5PLextern.h" + + +H5PL_type_t H5PLget_plugin_type(void); + + +const void* H5PLget_plugin_info(void); + + +#endif // PLUGIN_BLOSC_H + + diff --git a/hdf5-blosc/src/example.c b/hdf5-blosc/src/example.c new file mode 100644 index 0000000..f06d411 --- /dev/null +++ b/hdf5-blosc/src/example.c @@ -0,0 +1,125 @@ +/* + Copyright (C) 2010 Francesc Alted + http://blosc.org + License: MIT (see LICENSE.txt) + + Example program demonstrating use of the Blosc filter from C code. + This is based on the LZF example (http://h5py.alfven.org) by + Andrew Collette. + + To compile this program: + + h5cc blosc_filter.c example.c -o example -lblosc -lpthread + + To run: + + $ ./example + Blosc version info: 1.3.0 ($Date:: 2014-01-11 #$) + Success! + $ h5ls -v example.h5 + Opened "example.h5" with sec2 driver. 
+ dset Dataset {100/100, 100/100, 100/100} + Location: 1:800 + Links: 1 + Chunks: {1, 100, 100} 40000 bytes + Storage: 4000000 logical bytes, 126002 allocated bytes, 3174.55% utilization + Filter-0: blosc-32001 OPT {2, 2, 4, 40000, 4, 1, 2} + Type: native float + +*/ + +#include +#include "hdf5.h" +#include "blosc_filter.h" + +#define SIZE 100*100*100 +#define SHAPE {100,100,100} +#define CHUNKSHAPE {1,100,100} + +int main(){ + + static float data[SIZE]; + static float data_out[SIZE]; + const hsize_t shape[] = SHAPE; + const hsize_t chunkshape[] = CHUNKSHAPE; + char *version, *date; + int r, i; + unsigned int cd_values[7]; + int return_code = 1; + + hid_t fid, sid, dset, plist = 0; + + for(i=0; i0) H5Dclose(dset); + if(sid>0) H5Sclose(sid); + if(plist>0) H5Pclose(plist); + if(fid>0) H5Fclose(fid); + + return return_code; +} diff --git a/hdf5-blosc/travis-before-install.sh b/hdf5-blosc/travis-before-install.sh new file mode 100755 index 0000000..8dacf77 --- /dev/null +++ b/hdf5-blosc/travis-before-install.sh @@ -0,0 +1,16 @@ +#/bin/sh -f + +# things to do for travis-ci in the before_install section + +if ( test "`uname -s`" = "Darwin" ) +then + #cmake v2.8.12 is installed on the Mac workers now + #brew update + #brew install cmake + echo +else + #install a newer cmake since at this time Travis only has version 2.8.7 + sudo add-apt-repository --yes ppa:kalakris/cmake + sudo apt-get update -qq + sudo apt-get install cmake +fi diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..48d0c2e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,9 @@ +[build-system] +requires = [ + "setuptools >=42.0", + "wheel", + "oldest-supported-numpy", + "packaging", + "Cython >=0.29.21", +] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..dc6427e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +# Keep in sync with tables/req_versions.py and +# doc/source/usersguide/installation.rst +numpy>=1.19.0 +numexpr>=2.6.2 +packaging diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..82b767f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,75 @@ +[metadata] +name = tables +# version = attr: src.VERSION +description = Hierarchical datasets for Python +long_description = + PyTables is a package for managing hierarchical datasets and + designed to efficiently cope with extremely large amounts of + data. PyTables is built on top of the HDF5 library and the + NumPy package and features an object-oriented interface + that, combined with C-code generated from Cython sources, + makes of it a fast, yet extremely easy to use tool for + interactively save and retrieve large amounts of data. +long_description_content_type = text/x-rst +author = Francesc Alted, Ivan Vilata, Antonio Valentino, Anthony Scopatz et al. 
+author_email = pytables@pytables.org +maintainer = PyTables maintainers +maintainer_email = pytables@pytables.org +url = https://www.pytables.org +license = BSD 3-Clause License +license_files = LICENSE.txt +classifiers = + Development Status :: 5 - Production/Stable + Intended Audience :: Developers + Intended Audience :: Information Technology + Intended Audience :: Science/Research + License :: OSI Approved :: BSD License + Operating System :: Microsoft :: Windows + Operating System :: Unix + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Topic :: Database + Topic :: Software Development :: Libraries :: Python Modules +platforms = any +keywords = hdf5 + +[options] +python_requires = >=3.6 +zip_safe = False +# install_requires = +# numpy>=1.19.0 +# numexpr>=2.6.2 +# packaging +packages = find: +include_package_data = True + +[options.entry_points] +console_scripts = + ptdump = tables.scripts.ptdump:main + ptrepack = tables.scripts.ptrepack:main + pt2to3 = tables.scripts.pt2to3:main + pttree = tables.scripts.pttree:main + +[options.extras_require] +doc = + sphinx>=1.1 + sphinx_rtd_theme + numpydoc + ipython + +[options.packages.find] +exclude = + bench + +[options.package_data] +tables.tests = *.h5, *.mat +tables.nodes.tests = *.dat, *.xbm, *.h5 + +[sdist] +formats = gztar diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..ab8f8e3 --- /dev/null +++ b/setup.py @@ -0,0 +1,1042 @@ +#!/usr/bin/env python + +"""Setup script for the tables package""" + +import os +import sys +import ctypes +import shutil +import platform +import tempfile +import textwrap +import subprocess +from pathlib import Path + +# Using ``setuptools`` enables lots of goodies +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +import pkg_resources +from packaging.version import Version + + +# The name for the pkg-config utility +PKG_CONFIG = "pkg-config" + + +# Some functions for showing errors and warnings. +def _print_admonition(kind, head, body): + tw = textwrap.TextWrapper( + initial_indent=" ", subsequent_indent=" " + ) + + print(f".. 
{kind.upper()}:: {head}") + for line in tw.wrap(body): + print(line) + + +def exit_with_error(head, body=""): + _print_admonition("error", head, body) + sys.exit(1) + + +def print_warning(head, body=""): + _print_admonition("warning", head, body) + + +def get_version(filename): + import re + + with open(filename) as fd: + data = fd.read() + + mobj = re.search( + r'''^__version__\s*=\s*(?P['"])(?P.*)(?P=quote)''', + data, re.MULTILINE) + return mobj.group('version') + + +# Get the HDF5 version provided the 'H5public.h' header +def get_hdf5_version(headername): + major, minor, release = None, None, None + for line in headername.read_text().splitlines(): + if "H5_VERS_MAJOR" in line: + major = int(line.split()[2]) + elif "H5_VERS_MINOR" in line: + minor = int(line.split()[2]) + elif "H5_VERS_RELEASE" in line: + release = int(line.split()[2]) + if None not in (major, minor, release): + break + else: + exit_with_error("Unable to detect HDF5 library version!") + return Version(f"{major}.{minor}.{release}") + + +# Get the Blosc version provided the 'blosc.h' header +def get_blosc_version(headername): + major, minor, release = None, None, None + for line in headername.read_text().splitlines(): + if "BLOSC_VERSION_MAJOR" in line: + major = int(line.split()[2]) + elif "BLOSC_VERSION_MINOR" in line: + minor = int(line.split()[2]) + elif "BLOSC_VERSION_RELEASE" in line: + release = int(line.split()[2]) + if None not in (major, minor, release): + break + else: + exit_with_error("Unable to detect Blosc library version!") + return Version(f"{major}.{minor}.{release}") + + +def newer(source, target): + """Return true if 'source' exists and is more recently modified than + 'target', or if 'source' exists and 'target' doesn't. Return false if + both exist and 'target' is the same age or younger than 'source'. + Raise FileNotFoundError if 'source' does not exist. + """ + source = Path(source) + if not source.exists(): + raise FileNotFoundError(f"file '{source.absolute()}' does not exist") + target = Path(target) + if not target.exists(): + return True + return source.stat().st_mtime > target.stat().st_mtime + + +# https://github.com/pypa/setuptools/issues/2806 +def new_compiler(): + from setuptools import Distribution + + build_ext = Distribution().get_command_obj("build_ext") + build_ext.finalize_options() + # register an extension to ensure a compiler is created + build_ext.extensions = [Extension("ignored", ["ignored.c"])] + # disable building fake extensions + build_ext.build_extensions = lambda: None + # run to populate self.compiler + build_ext.run() + return build_ext.compiler + + +def add_from_path(envname, dirs): + dirs.extend( + Path(x) for x in os.environ.get(envname, "").split(os.pathsep) if x + ) + + +def add_from_flags(envname, flag_key, dirs): + dirs.extend( + Path(flag[len(flag_key):]) + for flag in os.environ.get(envname, "").split() + if flag.startswith(flag_key) + ) + + +def _find_file_path(name, locations, prefixes=("",), suffixes=("",)): + for prefix in prefixes: + for suffix in suffixes: + for location in locations: + path = location / f"{prefix}{name}{suffix}" + if path.is_file(): + return str(path) + return None + + +# We need to avoid importing numpy until we can be sure it's installed +# This approach is based on this SO answer http://stackoverflow.com/a/21621689 +# This is also what pandas does. 
+class BuildExtensions(build_ext): + """Subclass setuptools build_ext command + + BuildExtensions does two things + 1) it makes sure numpy is available + 2) it injects numpy's core/include directory in the include_dirs + parameter of all extensions + 3) it runs the original build_ext command + """ + + def run(self): + # According to + # https://pip.pypa.io/en/stable/reference/pip_install.html#installation-order + # at this point we can be sure pip has already installed numpy + numpy_incl = pkg_resources.resource_filename( + "numpy", "core/include" + ) + + for ext in self.extensions: + if ( + hasattr(ext, "include_dirs") + and numpy_incl not in ext.include_dirs + ): + ext.include_dirs.append(numpy_incl) + + build_ext.run(self) + + +if __name__ == "__main__": + ROOT = Path(__file__).resolve().parent + VERSION = get_version(ROOT.joinpath("tables/__init__.py")) + # Fetch the requisites + requirements = (ROOT / "requirements.txt").read_text().splitlines() + + # `cpuinfo.py` uses multiprocessing to check CPUID flags. On Windows, the + # entire setup script needs to be protected as a result + # For guessing the capabilities of the CPU for C-Blosc + try: + import cpuinfo + + cpu_info = cpuinfo.get_cpu_info() + cpu_flags = cpu_info["flags"] + except Exception as e: + print("cpuinfo failed, assuming no CPU features:", e) + cpu_flags = [] + + # The minimum required versions + min_python_version = (3, 6) + # Check for Python + if sys.version_info < min_python_version: + exit_with_error("You need Python 3.6 or greater to install PyTables!") + print(f"* Using Python {sys.version.splitlines()[0]}") + + try: + import cython + print(f"* Found cython {cython.__version__}") + del cython + except ImportError: + pass + + # Minimum required versions for numpy, numexpr and HDF5 + _min_versions = {} + exec((ROOT / "tables" / "req_versions.py").read_text(), _min_versions) + min_hdf5_version = _min_versions["min_hdf5_version"] + min_blosc_version = _min_versions["min_blosc_version"] + min_blosc_bitshuffle_version = _min_versions[ + "min_blosc_bitshuffle_version" + ] + + # ---------------------------------------------------------------------- + + debug = "--debug" in sys.argv + + # Global variables + lib_dirs = [] + inc_dirs = [Path("hdf5-blosc/src")] + optional_libs = [] + data_files = [] # list of data files to add to packages (mainly for DLL's) + + default_header_dirs = None + default_library_dirs = None + default_runtime_dirs = None + + if os.name == "posix": + prefixes = ("/usr/local", "/sw", "/opt", "/opt/local", "/usr", "/") + prefix_paths = [Path(x) for x in prefixes] + + default_header_dirs = [] + add_from_path("CPATH", default_header_dirs) + add_from_path("C_INCLUDE_PATH", default_header_dirs) + add_from_flags("CPPFLAGS", "-I", default_header_dirs) + add_from_flags("CFLAGS", "-I", default_header_dirs) + default_header_dirs.extend(_tree / "include" for _tree in prefix_paths) + + default_library_dirs = [] + add_from_flags("LDFLAGS", "-L", default_library_dirs) + default_library_dirs.extend( + _tree / _arch + for _tree in prefix_paths + for _arch in ("lib64", "lib") + ) + default_runtime_dirs = default_library_dirs + + elif os.name == "nt": + default_header_dirs = [] # no default, must be given explicitly + default_library_dirs = [] # no default, must be given explicitly + default_runtime_dirs = [ # look for DLL files in ``%PATH%`` + Path(_path) for _path in os.environ["PATH"].split(";") + ] + # Add the \Windows\system to the runtime list (necessary for Vista) + 
default_runtime_dirs.append(Path("\\windows\\system")) + # Add the \path_to_python\DLLs and tables package to the list + default_runtime_dirs.append( + Path(sys.prefix) / "Lib" / "site-packages" / "tables" + ) + + # Gcc 4.0.1 on Mac OS X 10.4 does not seem to include the default + # header and library paths. See ticket #18. + if sys.platform.lower().startswith("darwin"): + inc_dirs.extend(default_header_dirs) + lib_dirs.extend(default_library_dirs) + + class BasePackage: + _library_prefixes = [] + _library_suffixes = [] + _runtime_prefixes = [] + _runtime_suffixes = [] + _component_dirs = [] + + def __init__( + self, name, tag, header_name, library_name, target_function=None + ): + self.name = name + self.tag = tag + self.header_name = header_name + self.library_name = library_name + self.runtime_name = library_name + self.target_function = target_function + + def find_header_path(self, locations=default_header_dirs): + return _find_file_path( + self.header_name, locations, suffixes=[".h"] + ) + + def find_library_path(self, locations=default_library_dirs): + return _find_file_path( + self.library_name, + locations, + self._library_prefixes, + self._library_suffixes, + ) + + def find_runtime_path(self, locations=default_runtime_dirs): + """ + returns True if the runtime can be found + returns None otherwise + """ + # An explicit path can not be provided for runtime libraries. + # (The argument is accepted for compatibility with previous + # methods.) + + # dlopen() won't tell us where the file is, just whether + # success occurred, so this returns True instead of a filename + for prefix in self._runtime_prefixes: + for suffix in self._runtime_suffixes: + try: + ctypes.CDLL(f"{prefix}{self.runtime_name}{suffix}") + except OSError: + pass + else: + return True + + def _pkg_config(self, flags): + try: + cmd = [PKG_CONFIG] + flags.split() + [self.library_name] + config = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except (OSError, subprocess.CalledProcessError): + return [] + else: + return config.decode().strip().split() + + def find_directories(self, location, use_pkgconfig=False): + dirdata = [ + (self.header_name, self.find_header_path, default_header_dirs), + ( + self.library_name, + self.find_library_path, + default_library_dirs, + ), + ( + self.runtime_name, + self.find_runtime_path, + default_runtime_dirs, + ), + ] + + locations = [] + if location: + # The path of a custom install of the package has been + # provided, so the directories where the components + # (headers, libraries, runtime) are going to be searched + # are constructed by appending platform-dependent + # component directories to the given path. 
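+                # For instance, with a POSIX package an illustrative
+                # location of '/usr/local/hdf5' would be searched under
+                # '/usr/local/hdf5/include', '/usr/local/hdf5/lib' and
+                # '/usr/local/hdf5/lib64'.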
+ # Remove leading and trailing '"' chars that can mislead + # the finding routines on Windows machines + locations = [ + Path(str(location).strip('"')) / compdir + for compdir in self._component_dirs + ] + + if use_pkgconfig: + # header + pkgconfig_header_dirs = [ + Path(d[2:]) + for d in self._pkg_config("--cflags") + if d.startswith("-I") + ] + if pkgconfig_header_dirs: + print( + f"* pkg-config header dirs for {self.name}:", + ", ".join(str(x) for x in pkgconfig_header_dirs), + ) + + # library + pkgconfig_library_dirs = [ + Path(d[2:]) + for d in self._pkg_config("--libs-only-L") + if d.startswith("-L") + ] + if pkgconfig_library_dirs: + print( + f"* pkg-config library dirs for {self.name}:", + ", ".join(str(x) for x in pkgconfig_library_dirs), + ) + + # runtime + pkgconfig_runtime_dirs = pkgconfig_library_dirs + + pkgconfig_dirs = [ + pkgconfig_header_dirs, + pkgconfig_library_dirs, + pkgconfig_runtime_dirs, + ] + else: + pkgconfig_dirs = [None, None, None] + + directories = [None, None, None] # headers, libraries, runtime + for idx, (name, find_path, default_dirs) in enumerate(dirdata): + path = find_path( + pkgconfig_dirs[idx] or locations or default_dirs + ) + if path: + if path is True: + directories[idx] = True + continue + + # Take care of not returning a directory component + # included in the name. For instance, if name is + # 'foo/bar' and path is '/path/foo/bar.h', do *not* + # take '/path/foo', but just '/path'. This also works + # for name 'libfoo.so' and path '/path/libfoo.so'. + # This has been modified to just work over include files. + # For libraries, its names can be something like 'bzip2' + # and if they are located in places like: + # \stuff\bzip2-1.0.3\lib\bzip2.lib + # then, the directory will be returned as '\stuff' (!!) + # F. Alted 2006-02-16 + if idx == 0: + directories[idx] = Path(path[: path.rfind(name)]) + else: + directories[idx] = Path(path).parent + + return tuple(directories) + + class PosixPackage(BasePackage): + _library_prefixes = ["lib"] + _library_suffixes = [".so", ".dylib", ".a"] + _runtime_prefixes = _library_prefixes + _runtime_suffixes = [".so", ".dylib"] + _component_dirs = ["include", "lib", "lib64"] + + class WindowsPackage(BasePackage): + _library_prefixes = [""] + _library_suffixes = [".lib"] + _runtime_prefixes = [""] + _runtime_suffixes = [".dll"] + + # lookup in '.' seems necessary for LZO2 + _component_dirs = ["include", "lib", "dll", "bin", "."] + + def find_runtime_path(self, locations=default_runtime_dirs): + # An explicit path can not be provided for runtime libraries. + # (The argument is accepted for compatibility with previous + # methods.) + return _find_file_path( + self.runtime_name, + default_runtime_dirs, + self._runtime_prefixes, + self._runtime_suffixes, + ) + + if os.name == "posix": + _Package = PosixPackage + _platdep = { # package tag -> platform-dependent components + "HDF5": ["hdf5"], + "LZO2": ["lzo2"], + "LZO": ["lzo"], + "BZ2": ["bz2"], + "BLOSC": ["blosc"], + } + + elif os.name == "nt": + _Package = WindowsPackage + _platdep = { # package tag -> platform-dependent components + "HDF5": ["hdf5", "hdf5"], + "LZO2": ["lzo2", "lzo2"], + "LZO": ["liblzo", "lzo1"], + "BZ2": ["bzip2", "bzip2"], + "BLOSC": ["blosc", "blosc"], + } + + # Copy the next DLL's to binaries by default. 
+ dll_files = [ + # '\\windows\\system\\zlib1.dll', + # '\\windows\\system\\szip.dll', + ] + + if os.environ.get("HDF5_USE_PREFIX", None): + # This is used on CI systems to link against HDF5 library + # The vendored `hdf5.dll` in a wheel is renamed to: + # `pytables_hdf5.dll` This should prevent DLL Hell. + print( + "* HDF5_USE_PREFIX: Trying to build against pytables_hdf5.dll" + ) + _platdep["HDF5"] = ["pytables_hdf5", "pytables_hdf5"] + + if debug: + _platdep["HDF5"] = ["hdf5_D", "hdf5_D"] + + else: + _Package = None + _platdep = {} + exit_with_error(f"Unsupported OS: {os.name}") + + hdf5_package = _Package("HDF5", "HDF5", "H5public", *_platdep["HDF5"]) + hdf5_package.target_function = "H5close" + lzo2_package = _Package( + "LZO 2", "LZO2", str(Path("lzo/lzo1x")), *_platdep["LZO2"] + ) + lzo2_package.target_function = "lzo_version_date" + lzo1_package = _Package("LZO 1", "LZO", "lzo1x", *_platdep["LZO"]) + lzo1_package.target_function = "lzo_version_date" + bzip2_package = _Package("bzip2", "BZ2", "bzlib", *_platdep["BZ2"]) + bzip2_package.target_function = "BZ2_bzlibVersion" + blosc_package = _Package("blosc", "BLOSC", "blosc", *_platdep["BLOSC"]) + blosc_package.target_function = "blosc_list_compressors" # Blosc >= 1.3 + + # ----------------------------------------------------------------- + + def_macros = [("NDEBUG", 1)] + # Define macros for Windows platform + if os.name == "nt": + def_macros.append(("WIN32", 1)) + def_macros.append(("_HDF5USEDLL_", 1)) + def_macros.append(("H5_BUILT_AS_DYNAMIC_LIB", 1)) + + # Allow setting the HDF5 dir and additional link flags either in + # the environment or on the command line. + # First check the environment... + HDF5_DIR = os.environ.get("HDF5_DIR", "") + LZO_DIR = os.environ.get("LZO_DIR", "") + BZIP2_DIR = os.environ.get("BZIP2_DIR", "") + BLOSC_DIR = os.environ.get("BLOSC_DIR", "") + LFLAGS = os.environ.get("LFLAGS", "").split() + # in GCC-style compilers, -w in extra flags will get rid of copious + # 'uninitialized variable' Cython warnings. However, this shouldn't be + # the default as it will suppress *all* the warnings, which definitely + # is not a good idea. + CFLAGS = os.environ.get("CFLAGS", "").split() + LIBS = os.environ.get("LIBS", "").split() + CONDA_PREFIX = os.environ.get("CONDA_PREFIX", "") + # We start using pkg-config since some distributions are putting HDF5 + # (and possibly other libraries) in exotic locations. See issue #442. + if shutil.which(PKG_CONFIG): + USE_PKGCONFIG = os.environ.get("USE_PKGCONFIG", "TRUE") + else: + USE_PKGCONFIG = "FALSE" + + # ...then the command line. + # Handle --hdf5=[PATH] --lzo=[PATH] --bzip2=[PATH] --blosc=[PATH] + # --lflags=[FLAGS] --cflags=[FLAGS] and --debug + for arg in list(sys.argv): + key, _, val = arg.partition("=") + if key == "--hdf5": + HDF5_DIR = Path(val).expanduser() + elif key == "--lzo": + LZO_DIR = Path(val).expanduser() + elif key == "--bzip2": + BZIP2_DIR = Path(val).expanduser() + elif key == "--blosc": + BLOSC_DIR = Path(val).expanduser() + elif key == "--lflags": + LFLAGS = val.split() + elif key == "--cflags": + CFLAGS = val.split() + elif key == "--debug": + # For debugging (mainly compression filters) + if os.name != "nt": # to prevent including dlfcn.h by utils.c!!! + def_macros = [("DEBUG", 1)] + # Don't delete this argument. 
It maybe useful for distutils + # when adding more flags later on + continue + elif key == "--use-pkgconfig": + USE_PKGCONFIG = val + CONDA_PREFIX = "" + elif key == "--no-conda": + CONDA_PREFIX = "" + else: + continue + sys.argv.remove(arg) + + USE_PKGCONFIG = USE_PKGCONFIG.upper() == "TRUE" + print("* USE_PKGCONFIG:", USE_PKGCONFIG) + + # For windows, search for the hdf5 dll in the path and use it if found. + # This is much more convenient than having to manually set an environment + # variable to rebuild pytables + if not HDF5_DIR and os.name == "nt": + import ctypes.util + + if not debug: + libdir = ctypes.util.find_library( + "hdf5.dll" + ) or ctypes.util.find_library("hdf5dll.dll") + else: + libdir = ctypes.util.find_library( + "hdf5_D.dll" + ) or ctypes.util.find_library("hdf5ddll.dll") + # Like 'C:\\Program Files\\HDF Group\\HDF5\\1.8.8\\bin\\hdf5dll.dll' + if libdir: + # Strip off the filename and the 'bin' directory + HDF5_DIR = Path(libdir).parent.parent + print(f"* Found HDF5 using system PATH ('{libdir}')") + + if CONDA_PREFIX: + CONDA_PREFIX = Path(CONDA_PREFIX) + print(f"* Found conda env: ``{CONDA_PREFIX}``") + if os.name == "nt": + CONDA_PREFIX = CONDA_PREFIX / "Library" + + # The next flag for the C compiler is needed for finding the C headers for + # the Cython extensions + CFLAGS.append("-Isrc") + + # Force the 1.8.x HDF5 API even if the library as been compiled to use the + # 1.6.x API by default + CFLAGS.extend([ + "-DH5_USE_18_API", + "-DH5Acreate_vers=2", + "-DH5Aiterate_vers=2", + "-DH5Dcreate_vers=2", + "-DH5Dopen_vers=2", + "-DH5Eclear_vers=2", + "-DH5Eprint_vers=2", + "-DH5Epush_vers=2", + "-DH5Eset_auto_vers=2", + "-DH5Eget_auto_vers=2", + "-DH5Ewalk_vers=2", + "-DH5E_auto_t_vers=2", + "-DH5Gcreate_vers=2", + "-DH5Gopen_vers=2", + "-DH5Pget_filter_vers=2", + "-DH5Pget_filter_by_id_vers=2", + # "-DH5Pinsert_vers=2", + # "-DH5Pregister_vers=2", + # "-DH5Rget_obj_type_vers=2", + "-DH5Tarray_create_vers=2", + # "-DH5Tcommit_vers=2", + "-DH5Tget_array_dims_vers=2", + # "-DH5Topen_vers=2", + "-DH5Z_class_t_vers=2", + ]) + + # H5Oget_info_by_name seems to have performance issues (see gh-402), so we + # need to use teh deprecated H5Gget_objinfo function + # CFLAGS.append("-DH5_NO_DEPRECATED_SYMBOLS") + + # Do not use numpy deprecated API + CFLAGS.append("-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION") + + # Try to locate the compulsory and optional libraries. + lzo2_enabled = False + compiler = new_compiler() + for (package, location) in [ + (hdf5_package, HDF5_DIR), + (lzo2_package, LZO_DIR), + (lzo1_package, LZO_DIR), + (bzip2_package, BZIP2_DIR), + (blosc_package, BLOSC_DIR), + ]: + + if package.tag == "LZO" and lzo2_enabled: + print( + f"* Skipping detection of {lzo1_package.name} " + f"since {lzo2_package.name} has already been found." + ) + continue # do not use LZO 1 if LZO 2 is available + + # if a package location is not specified, try to find it in conda env + if not location and CONDA_PREFIX: + location = CONDA_PREFIX + + # looking for lzo/lzo1x.h but pkgconfig already returns + # '/usr/include/lzo' + use_pkgconfig = USE_PKGCONFIG and package.tag != "LZO2" + + (hdrdir, libdir, rundir) = package.find_directories( + location, use_pkgconfig=use_pkgconfig + ) + + # check if HDF5 library uses old DLL naming scheme + if hdrdir and package.tag == "HDF5": + hdf5_version = get_hdf5_version(Path(hdrdir) / "H5public.h") + if hdf5_version < min_hdf5_version: + exit_with_error( + f"Unsupported HDF5 version! HDF5 v{min_hdf5_version}+ " + f"required. 
Found version v{hdf5_version}" + ) + + if os.name == "nt" and hdf5_version < Version("1.8.10"): + # Change in DLL naming happened in 1.8.10 + hdf5_old_dll_name = "hdf5dll" if not debug else "hdf5ddll" + package.library_name = hdf5_old_dll_name + package.runtime_name = hdf5_old_dll_name + _platdep["HDF5"] = [hdf5_old_dll_name, hdf5_old_dll_name] + _, libdir, rundir = package.find_directories( + location, use_pkgconfig=USE_PKGCONFIG + ) + + # check if the library is in the standard compiler paths + if not libdir and package.target_function: + libdir = compiler.has_function( + package.target_function, libraries=(package.library_name,) + ) + + if not (hdrdir and libdir): + if package.tag in ["HDF5"]: # these are compulsory! + pname, ptag = package.name, package.tag + exit_with_error( + f"Could not find a local {pname} installation.", + f"You may need to explicitly state where your local " + f"{pname} headers and library can be found by setting " + f"the ``{ptag}_DIR`` environment variable or by using " + f"the ``--{ptag.lower()}`` command-line option.", + ) + if package.tag == "BLOSC": + # this is optional, but comes with sources + print( + f"* Could not find {package.name} headers and library; " + f"using internal sources." + ) + else: + print( + f"* Could not find {package.name} headers and library; " + f"disabling support for it." + ) + + continue # look for the next library + + if libdir in ("", True): + print( + f"* Found {package.name} headers at ``{hdrdir}``, the library " + f"is located in the standard system search dirs." + ) + else: + print( + f"* Found {package.name} headers at ``{hdrdir}``, " + f"library at ``{libdir}``." + ) + + if hdrdir not in default_header_dirs: + inc_dirs.append(Path(hdrdir)) # save header directory if needed + if libdir not in default_library_dirs and libdir not in ("", True): + # save library directory if needed + lib_dirs.append(Path(libdir)) + + if package.tag not in ["HDF5"]: + # Keep record of the optional libraries found. + optional_libs.append(package.tag) + def_macros.append((f"HAVE_{package.tag}_LIB", 1)) + + if hdrdir and package.tag == "BLOSC": + blosc_version = get_blosc_version(Path(hdrdir) / "blosc.h") + if blosc_version < min_blosc_version: + optional_libs.pop() # Remove Blosc from the discovered libs + print_warning( + f"Unsupported Blosc version installed! Blosc " + f"{min_blosc_version}+ required. Found version " + f"{blosc_version}. Using internal Blosc sources." + ) + if blosc_version < min_blosc_bitshuffle_version: + print_warning( + f"This Blosc version does not support the BitShuffle " + f"filter. Minimum desirable version is " + f"{min_blosc_bitshuffle_version}. " + f"Found version: {blosc_version}" + ) + + if not rundir: + loc = { + "posix": "the default library paths", + "nt": "any of the directories in %%PATH%%", + }[os.name] + + if "bdist_wheel" in sys.argv and os.name == "nt": + exit_with_error( + f"Could not find the {package.name} runtime.", + f"The {package.name} shared library was *not* found in " + f"{loc}. Cannot build wheel without the runtime.", + ) + else: + print_warning( + f"Could not find the {package.name} runtime.", + f"The {package.name} shared library was *not* found " + f"in {loc}. In case of runtime problems, please " + f"remember to install it.", + ) + + if os.name == "nt": + # LZO DLLs cannot be copied to the binary package for license + # reasons + if package.tag not in ["LZO", "LZO2"]: + dll_file = f"{_platdep[package.tag][1]}.dll" + # If DLL is not in rundir, do nothing. 
This can be useful + # for BZIP2, that can be linked either statically (.LIB) + # or dynamically (.DLL) + if rundir is not None: + dll_files.append(Path(rundir) / dll_file) + + if os.name == "nt" and package.tag in ["HDF5"]: + # hdf5.dll usually depends on zlib.dll + import ctypes.util + z_lib_path = ctypes.util.find_library("zlib.dll") + if z_lib_path: + print(f"* Adding zlib.dll (hdf5 dependency): ``{z_lib_path}``") + dll_files.append(z_lib_path) + + if package.tag == "LZO2": + lzo2_enabled = True + + lzo_package = lzo2_package if lzo2_enabled else lzo1_package + + # ------------------------------------------------------------------------------ + + cython_extnames = [ + "utilsextension", + "hdf5extension", + "tableextension", + "linkextension", + "_comp_lzo", + "_comp_bzip2", + "lrucacheextension", + "indexesextension", + ] + + def get_cython_extfiles(extnames): + extdir = Path("tables") + extfiles = {} + + for extname in extnames: + extfile = extdir / extname + extpfile = extfile.with_suffix(".pyx") + extcfile = extfile.with_suffix(".c") + + if not extcfile.exists() or newer(extpfile, extcfile): + # This is the only place where Cython is needed, but every + # developer should have it installed, so it should not be + # a hard requisite + from Cython.Build import cythonize + + cythonize(str(extpfile), language_level="2") + extfiles[extname] = extcfile + + return extfiles + + cython_extfiles = get_cython_extfiles(cython_extnames) + + # -------------------------------------------------------------------- + if os.name == "nt": + for dll_file in dll_files: + shutil.copy(dll_file, 'tables') + dll_dir = Path('tables') + dll_files = [dll_dir / Path(dll_file).name for dll_file in dll_files] + + # Add DLL's to the final package for windows + data_files.append((Path("Lib/site-packages/tables"), dll_files)) + + ADDLIBS = [hdf5_package.library_name] + + # List of Blosc file dependencies + blosc_path = Path("c-blosc/blosc") + int_complibs_path = Path("c-blosc/internal-complibs") + + blosc_sources = [Path("hdf5-blosc/src/blosc_filter.c")] + if "BLOSC" not in optional_libs: + if not os.environ.get("PYTABLES_NO_EMBEDDED_LIBS", None) is None: + exit_with_error( + "Unable to find the blosc library. " + "The embedded copy of the blosc sources can't be used because " + "the PYTABLES_NO_EMBEDDED_LIBS environment variable has been " + "specified)." + ) + + # Compiling everything from sources + # Blosc + BloscLZ sources + blosc_sources += [ + f + for f in blosc_path.glob("*.c") + if "avx2" not in f.stem and "sse2" not in f.stem + ] + blosc_sources += int_complibs_path.glob("lz4*/*.c") # LZ4 sources + blosc_sources += int_complibs_path.glob("zlib*/*.c") # Zlib sources + blosc_sources += int_complibs_path.glob("zstd*/*/*.c") # Zstd sources + # Finally, add all the include dirs... 
+ inc_dirs += [blosc_path] + inc_dirs += int_complibs_path.glob("*") + inc_dirs += int_complibs_path.glob("zstd*/common") + inc_dirs += int_complibs_path.glob("zstd*") + # ...and the macros for all the compressors supported + def_macros += [("HAVE_LZ4", 1), ("HAVE_ZLIB", 1), ("HAVE_ZSTD", 1)] + + # Add extra flags for optimizing shuffle in include Blosc + def compiler_has_flags(compiler, flags): + with tempfile.NamedTemporaryFile( + mode="w", suffix=".c", delete=False + ) as fd: + fd.write("int main() {return 0;}") + + try: + compiler.compile([fd.name], extra_preargs=flags) + except Exception: + return False + else: + return True + finally: + Path(fd.name).unlink() + + # Set flags for SSE2 and AVX2 preventing false detection in case + # of emulation + if platform.machine() != 'aarch64': + # SSE2 + if "sse2" in cpu_flags and "DISABLE_SSE2" not in os.environ: + print("SSE2 detected and enabled") + CFLAGS.append("-DSHUFFLE_SSE2_ENABLED") + if os.name == "nt": + # Windows always should have support for SSE2 + # (present in all x86/amd64 architectures since 2003) + def_macros += [("__SSE2__", 1)] + else: + # On UNIX, both gcc and clang understand -msse2 + CFLAGS.append("-msse2") + blosc_sources += blosc_path.glob("*sse2*.c") + + # AVX2 + if "avx2" in cpu_flags and "DISABLE_AVX2" not in os.environ: + print("AVX2 detected and enabled") + if os.name == "nt": + def_macros += [("__AVX2__", 1)] + CFLAGS.append("-DSHUFFLE_AVX2_ENABLED") + blosc_sources += blosc_path.glob("*avx2*.c") + elif compiler_has_flags(compiler, ["-mavx2"]): + CFLAGS.append("-DSHUFFLE_AVX2_ENABLED") + CFLAGS.append("-mavx2") + blosc_sources += blosc_path.glob("*avx2*.c") + else: + ADDLIBS += ["blosc"] + + utilsExtension_libs = LIBS + ADDLIBS + hdf5Extension_libs = LIBS + ADDLIBS + tableExtension_libs = LIBS + ADDLIBS + linkExtension_libs = LIBS + ADDLIBS + indexesExtension_libs = LIBS + ADDLIBS + lrucacheExtension_libs = [] # Doesn't need external libraries + + # Compressor modules only need other libraries if they are enabled. 
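+ # When LZO/LZO2 or BZIP2 support has been detected above, the matching
+ # compressor extension links against both the compressor library and the
+ # HDF5 library (see the loop over (package, complibs) pairs below).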
+ _comp_lzo_libs = LIBS[:] + _comp_bzip2_libs = LIBS[:] + for (package, complibs) in [ + (lzo_package, _comp_lzo_libs), + (bzip2_package, _comp_bzip2_libs), + ]: + + if package.tag in optional_libs: + complibs.extend([hdf5_package.library_name, package.library_name]) + + # Extension expects strings, so we have to convert Path to str + blosc_sources = [str(x) for x in blosc_sources] + inc_dirs = [str(x) for x in inc_dirs] + + extension_kwargs = { + "extra_compile_args": CFLAGS, + "extra_link_args": LFLAGS, + "library_dirs": [str(x) for x in lib_dirs], + "define_macros": def_macros, + "include_dirs": [str(x) for x in inc_dirs], + } + + extensions = [ + Extension( + "tables.utilsextension", + sources=[ + str(cython_extfiles["utilsextension"]), + "src/utils.c", + "src/H5ARRAY.c", + "src/H5ATTR.c", + ] + + blosc_sources, + libraries=utilsExtension_libs, + **extension_kwargs, + ), + Extension( + "tables.hdf5extension", + sources=[ + str(cython_extfiles["hdf5extension"]), + "src/utils.c", + "src/typeconv.c", + "src/H5ARRAY.c", + "src/H5ARRAY-opt.c", + "src/H5VLARRAY.c", + "src/H5ATTR.c", + ] + + blosc_sources, + libraries=hdf5Extension_libs, + **extension_kwargs, + ), + Extension( + "tables.tableextension", + sources=[ + str(cython_extfiles["tableextension"]), + "src/utils.c", + "src/typeconv.c", + "src/H5TB-opt.c", + "src/H5ATTR.c", + ] + + blosc_sources, + libraries=tableExtension_libs, + **extension_kwargs, + ), + Extension( + "tables._comp_lzo", + sources=[str(cython_extfiles["_comp_lzo"]), "src/H5Zlzo.c"], + libraries=_comp_lzo_libs, + **extension_kwargs, + ), + Extension( + "tables._comp_bzip2", + sources=[str(cython_extfiles["_comp_bzip2"]), "src/H5Zbzip2.c"], + libraries=_comp_bzip2_libs, + **extension_kwargs, + ), + Extension( + "tables.linkextension", + sources=[str(cython_extfiles["linkextension"])], + libraries=tableExtension_libs, + **extension_kwargs, + ), + Extension( + "tables.lrucacheextension", + sources=[str(cython_extfiles["lrucacheextension"])], + libraries=lrucacheExtension_libs, + **extension_kwargs, + ), + Extension( + "tables.indexesextension", + sources=[ + str(cython_extfiles["indexesextension"]), + "src/H5ARRAY-opt.c", + "src/idx-opt.c", + ], + libraries=indexesExtension_libs, + **extension_kwargs, + ), + ] + + setup( + version=VERSION, + install_requires=requirements, + ext_modules=extensions, + cmdclass={"build_ext": BuildExtensions}, + data_files=[ + (str(parent), [str(file) for file in files]) + for parent, files in data_files + ], + ) diff --git a/src/H5ARRAY-opt.c b/src/H5ARRAY-opt.c new file mode 100644 index 0000000..ddbbece --- /dev/null +++ b/src/H5ARRAY-opt.c @@ -0,0 +1,304 @@ +#include "H5ARRAY-opt.h" + +#include +#include + +/*------------------------------------------------------------------------- + * Function: H5ARRAYOread_readSlice + * + * Purpose: Read records from an opened Array + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: May 27, 2004 + * + * Comments: + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYOread_readSlice( hid_t dataset_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ) +{ + hid_t space_id; + hid_t mem_space_id; + hsize_t count[2]; + int rank = 2; + hsize_t offset[2]; + hsize_t stride[2] = {1, 1}; + + + count[0] = 1; + count[1] = stop - start; + offset[0] = irow; + offset[1] = start; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id 
)) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple( rank, count, NULL )) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, stride, count, NULL) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + H5Dclose( dataset_id ); + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYOinit_readSlice + * + * Purpose: Prepare structures to read specifics arrays faster + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: May 18, 2006 + * + * Comments: + * - The H5ARRAYOinit_readSlice and H5ARRAYOread_readSlice + * are intended to read indexes slices only! + * F. Alted 2006-05-18 + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYOinit_readSlice( hid_t dataset_id, + hid_t *mem_space_id, + hsize_t count) + +{ + hid_t space_id; + int rank = 2; + hsize_t count2[2] = {1, count}; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space(dataset_id )) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (*mem_space_id = H5Screate_simple(rank, count2, NULL)) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + return 0; + +out: + H5Dclose(dataset_id); + return -1; + +} + +/*------------------------------------------------------------------------- + * Function: H5ARRAYOread_readSortedSlice + * + * Purpose: Read records from an opened Array + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: Aug 11, 2005 + * + * Comments: + * + * Modifications: + * - Modified to cache the mem_space_id as well. + * F. Alted 2005-08-11 + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYOread_readSortedSlice( hid_t dataset_id, + hid_t mem_space_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ) +{ + hid_t space_id; + hsize_t count[2] = {1, stop-start}; + hsize_t offset[2] = {irow, start}; + hsize_t stride[2] = {1, 1}; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space(dataset_id)) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, stride, count, NULL) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + H5Dclose( dataset_id ); + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYOread_readBoundsSlice + * + * Purpose: Read records from an opened Array + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: Aug 19, 2005 + * + * Comments: This is exactly the same as H5ARRAYOread_readSortedSlice, + * but I just want to distinguish the calls in profiles. 
+ * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYOread_readBoundsSlice( hid_t dataset_id, + hid_t mem_space_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ) +{ + hid_t space_id; + hsize_t count[2] = {1, stop-start}; + hsize_t offset[2] = {irow, start}; + hsize_t stride[2] = {1, 1}; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space(dataset_id)) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, stride, count, NULL) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + H5Dclose( dataset_id ); + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYreadSliceLR + * + * Purpose: Reads a slice of LR index cache from disk. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: August 17, 2005 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYOreadSliceLR(hid_t dataset_id, + hid_t type_id, + hsize_t start, + hsize_t stop, + void *data) +{ + hid_t space_id; + hid_t mem_space_id; + hsize_t count[1] = {stop - start}; + hsize_t stride[1] = {1}; + hsize_t offset[1] = {start}; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space(dataset_id)) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, stride, count, NULL) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple(1, count, NULL)) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Release resources */ + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + return 0; + +out: + H5Dclose( dataset_id ); + return -1; + +} diff --git a/src/H5ARRAY-opt.h b/src/H5ARRAY-opt.h new file mode 100644 index 0000000..ca81a86 --- /dev/null +++ b/src/H5ARRAY-opt.h @@ -0,0 +1,36 @@ +#include + +herr_t H5ARRAYOinit_readSlice( hid_t dataset_id, + hid_t *mem_space_id, + hsize_t count ); + +herr_t H5ARRAYOread_readSlice( hid_t dataset_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ); + +herr_t H5ARRAYOread_readSortedSlice( hid_t dataset_id, + hid_t mem_space_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ); + + +herr_t H5ARRAYOread_readBoundsSlice( hid_t dataset_id, + hid_t mem_space_id, + hid_t type_id, + hsize_t irow, + hsize_t start, + hsize_t stop, + void *data ); + +herr_t H5ARRAYOreadSliceLR( hid_t dataset_id, + hid_t type_id, + hsize_t start, + hsize_t stop, + void *data ); + diff --git a/src/H5ARRAY.c b/src/H5ARRAY.c new file mode 100644 index 0000000..b2b8738 --- /dev/null +++ b/src/H5ARRAY.c @@ -0,0 +1,891 @@ +#include "H5ATTR.h" + +#include "tables.h" +#include "utils.h" +#include "H5Zlzo.h" /* Import FILTER_LZO */ +#include "H5Zbzip2.h" /* Import FILTER_BZIP2 */ +#include "blosc_filter.h" /* Import FILTER_BLOSC */ + +#include 
+#include + +/*------------------------------------------------------------------------- + * + * Public functions + * + *------------------------------------------------------------------------- + */ +/*------------------------------------------------------------------------- + * Function: H5ARRAYmake + * + * Purpose: Creates and writes a dataset of a type type_id + * + * Return: Success: 0, Failure: -1 + * + * Programmer: F. Alted. October 21, 2002 + * + * Date: March 19, 2001 + * + * Comments: Modified by F. Alted. November 07, 2003 + * Modified by A. Cobb. August 21, 2017 (track_times) + * + *------------------------------------------------------------------------- + */ + +hid_t H5ARRAYmake( hid_t loc_id, + const char *dset_name, + const char *obversion, + const int rank, + const hsize_t *dims, + int extdim, + hid_t type_id, + hsize_t *dims_chunk, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data) +{ + + hid_t dataset_id, space_id; + hsize_t *maxdims = NULL; + hid_t plist_id = 0; + unsigned int cd_values[7]; + int blosc_compcode; + char *blosc_compname = NULL; + int chunked = 0; + int i; + + /* Check whether the array has to be chunked or not */ + if (dims_chunk) { + chunked = 1; + } + + if(chunked) { + maxdims = malloc(rank*sizeof(hsize_t)); + if(!maxdims) return -1; + + for(i=0;i dims[_extdim] ) { + printf("Asking for a range of rows exceeding the available ones!.\n"); + goto out; + } + + /* Define a hyperslab in the dataset of the size of the records */ + for (i=0; i dims[i] ) { + printf("Asking for a range of rows exceeding the available ones!.\n"); + goto out; + } + } + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, offset, stride, + count, NULL) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple( rank, count, NULL )) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, + data ) < 0 ) + goto out; + + /* Release resources */ + free(dims); + free(count); + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + } + else { /* Scalar case */ + + /* Read all the dataset */ + if (H5Dread(dataset_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data) < 0) + goto out; + } + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + return 0; + +out: +/* H5Dclose( dataset_id ); */ + if (dims) free(dims); + if (count) free(count); + return -1; + +} + + +/* The next represents a try to implement getCoords for != operator */ +/* but it turned out to be too difficult, well, at least to me :( */ +/* 2004-06-22 */ +/*------------------------------------------------------------------------- + * Function: H5ARRAYreadIndex + * + * Purpose: Reads a slice of array from disk for indexing purposes. 
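+ *           When the notequal flag is set, the row selection is complemented
+ *           against the whole row via H5S_SELECT_NOTA before reading (see the
+ *           function body below).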
+ * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: June 21, 2004 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYreadIndex( hid_t dataset_id, + hid_t type_id, + int notequal, + hsize_t *start, + hsize_t *stop, + hsize_t *step, + void *data ) +{ + + hid_t mem_space_id; + hid_t space_id; + hsize_t *dims = NULL; + hsize_t *count = NULL; + hsize_t *count2 = NULL; + hsize_t *offset2 = NULL; + hsize_t *stride = (hsize_t *)step; + hsize_t *offset = (hsize_t *)start; + int rank; + int i; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get the rank */ + if ( (rank = H5Sget_simple_extent_ndims(space_id)) < 0 ) + goto out; + + if (rank) { /* Array case */ + + /* Book some memory for the selections */ + dims = (hsize_t *)malloc(rank*sizeof(hsize_t)); + count = (hsize_t *)malloc(rank*sizeof(hsize_t)); + count2 = (hsize_t *)malloc(rank*sizeof(hsize_t)); + offset2 = (hsize_t *)malloc(rank*sizeof(hsize_t)); + + /* Get dataset dimensionality */ + if ( H5Sget_simple_extent_dims( space_id, dims, NULL) < 0 ) + goto out; + + for(i=0;i dims[i] ) { + printf("Asking for a range of rows exceeding the available ones!.\n"); + goto out; + } + } + + /* Define a hyperslab in the dataset of the size of the records */ + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, offset, stride, + count, NULL) < 0 ) + goto out; + + /* If we want the complementary, do a NOTA against all the row */ + if (notequal) { + offset2[0] = offset[0]; count2[0] = count[0]; + offset2[1] = 0; count2[1] = dims[1]; /* All the row */ + count[0] = 1; count[1] = dims[1] - count[1]; /* For memory dataspace */ + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_NOTA, offset2, stride, + count2, NULL) < 0 ) + goto out; + } + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple( rank, count, NULL )) < 0 ) + goto out; + + /* Read */ + if ( H5Dread( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Release resources */ + free(dims); + free(count); + free(offset2); + free(count2); + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + } + else { /* Scalar case */ + + /* Read all the dataset */ + if (H5Dread(dataset_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data) < 0) + goto out; + } + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + return 0; + +out: + if (dims) free(dims); + if (count) free(count); + return -1; + +} + + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYget_ndims + * + * Purpose: Gets the dimensionality of an array. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted + * + * Date: October 22, 2002 + * + * Modification: October 13, 2008 + * This routine not longer returns the dimensionality of data types + * in case they are H5T_ARRAY. 
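+ *     (The rank of an H5T_ARRAY datatype can still be obtained with
+ *      H5Tget_array_ndims() on the datatype itself.)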
+ * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYget_ndims( hid_t dataset_id, + int *rank ) +{ + hid_t space_id; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get rank */ + if ( (*rank = H5Sget_simple_extent_ndims( space_id )) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + + return 0; + +out: + return -1; + +} + + + +/* Modified version of H5LTget_dataset_info. */ + +herr_t H5ARRAYget_info( hid_t dataset_id, + hid_t type_id, + hsize_t *dims, + hsize_t *maxdims, + H5T_class_t *class_id, + char *byteorder) +{ + hid_t space_id; + + /* Get the class. */ + *class_id = H5Tget_class( type_id ); + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get dimensions */ + if ( H5Sget_simple_extent_dims( space_id, dims, maxdims) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* Get the byteorder */ + /* Only integer, float, time, enumerate and array classes can be + byteordered */ + if ((*class_id == H5T_INTEGER) || (*class_id == H5T_FLOAT) + || (*class_id == H5T_BITFIELD) || (*class_id == H5T_COMPOUND) + || (*class_id == H5T_TIME) || (*class_id == H5T_ENUM) + || (*class_id == H5T_ARRAY)) { + get_order(type_id, byteorder); + } + else { + strcpy(byteorder, "irrelevant"); + } + + return 0; + +out: + return -1; + +} + + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYget_chunkshape + * + * Purpose: Gets the chunkshape of a dataset. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted + * + * Date: May 20, 2004 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYget_chunkshape( hid_t dataset_id, + int rank, + hsize_t *dims_chunk) +{ + hid_t plist_id; + H5D_layout_t layout; + + /* Get creation properties list */ + if ( (plist_id = H5Dget_create_plist( dataset_id )) < 0 ) + goto out; + + /* Get the dataset layout */ + layout = H5Pget_layout(plist_id); + if (layout != H5D_CHUNKED) { + H5Pclose( plist_id ); + return -1; + } + + /* Get the chunkshape for all dimensions */ + if (H5Pget_chunk(plist_id, rank, dims_chunk ) < 0 ) + goto out; + + /* Terminate access to the datatype */ + if ( H5Pclose( plist_id ) < 0 ) + goto out; + + return 0; + +out: + if (dims_chunk) free(dims_chunk); + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYget_fill_value + * + * Purpose: Gets the fill value of a dataset. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted + * + * Date: Mar 03, 2009 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ARRAYget_fill_value( hid_t dataset_id, + hid_t type_id, + int *status, + void *value) +{ + hid_t plist_id; + + /* Get creation properties list */ + if ( (plist_id = H5Dget_create_plist(dataset_id)) < 0 ) + goto out; + + /* How the fill value is defined? 
*/ + if ( (H5Pfill_value_defined(plist_id, status)) < 0 ) + goto out; + + if ( *status == H5D_FILL_VALUE_USER_DEFINED ) { + if ( H5Pget_fill_value(plist_id, type_id, value) < 0 ) + goto out; + } + + /* Terminate access to the datatype */ + if ( H5Pclose( plist_id ) < 0 ) + goto out; + + return 0; + +out: + return -1; + +} diff --git a/src/H5ARRAY.h b/src/H5ARRAY.h new file mode 100644 index 0000000..804a7b5 --- /dev/null +++ b/src/H5ARRAY.h @@ -0,0 +1,95 @@ +#ifndef _H5ARRAY_H +#define _H5ARRAY_H + +#include + +#define TESTING(WHAT) {printf("%-70s", "Testing " WHAT); fflush(stdout);} +#define PASSED() {puts(" PASSED");fflush(stdout);} +#define H5_FAILED() {puts("*FAILED*");fflush(stdout);} +#define SKIPPED() {puts(" -SKIP-");fflush(stdout);} + + +#ifdef __cplusplus +extern "C" { +#endif + +hid_t H5ARRAYmake( hid_t loc_id, + const char *dset_name, + const char *obversion, + const int rank, + const hsize_t *dims, + int extdim, + hid_t type_id, + hsize_t *dims_chunk, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data); + +herr_t H5ARRAYappend_records( hid_t dataset_id, + hid_t type_id, + const int rank, + hsize_t *dims_orig, + hsize_t *dims_new, + int extdim, + const void *data ); + +herr_t H5ARRAYwrite_records( hid_t dataset_id, + hid_t type_id, + const int rank, + hsize_t *start, + hsize_t *step, + hsize_t *count, + const void *data ); + +herr_t H5ARRAYread( hid_t dataset_id, + hid_t type_id, + hsize_t start, + hsize_t nrows, + hsize_t step, + int extdim, + void *data ); + +herr_t H5ARRAYreadSlice( hid_t dataset_id, + hid_t type_id, + hsize_t *start, + hsize_t *stop, + hsize_t *step, + void *data ); + +herr_t H5ARRAYreadIndex( hid_t dataset_id, + hid_t type_id, + int notequal, + hsize_t *start, + hsize_t *stop, + hsize_t *step, + void *data ); + +herr_t H5ARRAYget_ndims( hid_t dataset_id, + int *rank ); + +herr_t H5ARRAYget_info( hid_t dataset_id, + hid_t type_id, + hsize_t *dims, + hsize_t *maxdims, + H5T_class_t *class_id, + char *byteorder); + +herr_t H5ARRAYget_chunkshape( hid_t dataset_id, + int rank, + hsize_t *dims_chunk); + +herr_t H5ARRAYget_fill_value( hid_t dataset_id, + hid_t type_id, + int *status, + void *value); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/H5ATTR.c b/src/H5ATTR.c new file mode 100644 index 0000000..8bd715d --- /dev/null +++ b/src/H5ATTR.c @@ -0,0 +1,622 @@ +/**************************************************************************** + * NCSA HDF * + * Scientific Data Technologies * + * National Center for Supercomputing Applications * + * University of Illinois at Urbana-Champaign * + * 605 E. Springfield, Champaign IL 61820 * + * * + * For conditions of distribution and use, see the accompanying * + * hdf/COPYING file. * + * * + * Modified versions of H5LT for getting and setting attributes for open + * groups and leaves. + * F. Alted 2005/09/29 + * * + ****************************************************************************/ + +#include +#include + +#include "H5ATTR.h" + + +/*------------------------------------------------------------------------- + * + * Set & get attribute functions + * + *------------------------------------------------------------------------- + */ + +/*------------------------------------------------------------------------- + * Function: H5ATTRset_attribute + * + * Purpose: Create an attribute named attr_name and attach it to the + * object specified by the name obj_name. 
This supports general + * n-dimensional types (rank > 0), but if rank == 0, an H5T_SCALAR is + * chosen. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted + * + * Date: October 18, 2006 + * + * Comments: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRset_attribute( hid_t obj_id, + const char *attr_name, + hid_t type_id, + size_t rank, + hsize_t *dims, + const char *attr_data ) +{ + hid_t space_id; + hid_t attr_id; + int has_attr; + + /* Create the data space for the attribute. */ + if (rank == 0) + space_id = H5Screate( H5S_SCALAR ); + else + space_id = H5Screate_simple( rank, dims, NULL ); + + /* Verify whether the attribute already exists */ + has_attr = H5ATTRfind_attribute( obj_id, attr_name ); + + /* The attribute already exists, delete it */ + if ( has_attr == 1 ) + { + if ( H5Adelete( obj_id, attr_name ) < 0 ) + goto out; + } + + /* Create and write the attribute */ + attr_id = H5Acreate( obj_id, attr_name, type_id, space_id, H5P_DEFAULT, + H5P_DEFAULT ); + + if ( H5Awrite( attr_id, type_id, attr_data ) < 0 ) + goto out; + + H5Aclose( attr_id ); + + H5Sclose( space_id ); + + return 0; + +out: + return -1; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRset_attribute_string + * + * Purpose: Creates and writes a string attribute named attr_name and attaches + * it to the object specified by the name obj_name. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: July 23, 2001 + * + * Comments: If the attribute already exists, it is overwritten + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRset_attribute_string( hid_t obj_id, + const char *attr_name, + const char *attr_data, + hsize_t attr_size, + int cset ) +{ + hid_t attr_type; + /*size_t attr_size;*/ + hid_t attr_space_id; + hid_t attr_id; + int has_attr; + + /* Create the attribute */ + if ( (attr_type = H5Tcopy( H5T_C_S1 )) < 0 ) + goto out; + + if ( ( ( cset == H5T_CSET_ASCII ) || ( cset == H5T_CSET_UTF8 ) ) && + ( H5Tset_cset( attr_type, cset ) < 0 ) ) + goto out; + + if ( H5Tset_strpad( attr_type, H5T_STR_NULLTERM ) < 0 ) + goto out; + + if ( attr_size > 0 ) + { + if (H5Tset_size( attr_type, attr_size) < 0 ) + goto out; + if ( (attr_space_id = H5Screate( H5S_SCALAR )) < 0 ) + goto out; + } + else + { + if ( (attr_space_id = H5Screate( H5S_NULL )) < 0 ) + goto out; + } + + /* Verify if the attribute already exists */ + has_attr = H5ATTRfind_attribute( obj_id, attr_name ); + + /* The attribute already exists, delete it */ + if ( has_attr == 1 ) + { + if ( H5Adelete( obj_id, attr_name ) < 0 ) + goto out; + } + + /* Create and write the attribute */ + + if ( (attr_id = H5Acreate( obj_id, attr_name, attr_type, attr_space_id, + H5P_DEFAULT, H5P_DEFAULT )) < 0 ) + goto out; + + if ( H5Awrite( attr_id, attr_type, attr_data ) < 0 ) + goto out; + + if ( H5Aclose( attr_id ) < 0 ) + goto out; + + if ( H5Sclose( attr_space_id ) < 0 ) + goto out; + + if ( H5Tclose(attr_type) < 0 ) + goto out; + + return 0; + +out: + return -1; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRget_attribute + * + * Purpose: Reads an attribute named attr_name with the memory type type_id + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: September 19, 2002 + * + * 
Comments: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRget_attribute( hid_t obj_id, + const char *attr_name, + hid_t type_id, + void *data ) +{ + + /* identifiers */ + hid_t attr_id; + + if ( ( attr_id = H5Aopen_by_name(obj_id, ".", attr_name, + H5P_DEFAULT, H5P_DEFAULT) ) < 0 ) + return -1; + + if ( H5Aread( attr_id, type_id, data ) < 0 ) + goto out; + + if ( H5Aclose( attr_id ) < 0 ) + return -1; + + return 0; + +out: + H5Aclose( attr_id ); + return -1; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRget_attribute_string + * + * Purpose: Reads an string attribute named attr_name. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: February 23, 2005 + * + * Comments: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +hsize_t H5ATTRget_attribute_string( hid_t obj_id, + const char *attr_name, + char **data, + int *cset ) +{ + /* identifiers */ + hid_t attr_id; + hid_t attr_type; + hid_t space_id; + hsize_t type_size = 0; + htri_t is_vlstr = 0; + + *data = NULL; + if ( ( attr_id = H5Aopen_by_name(obj_id, ".", attr_name, + H5P_DEFAULT, H5P_DEFAULT) ) < 0 ) + return -1; + + if ( (attr_type = H5Aget_type( attr_id )) < 0 ) + goto out; + + if ( ( cset != NULL ) && ( ( *cset = H5Tget_cset( attr_type ) ) < 0 ) ) + goto out; + + is_vlstr = H5Tis_variable_str( attr_type ); + if ( is_vlstr == 0 ) + { + /* Get the size */ + if ( (type_size = H5Tget_size( attr_type )) < 0 ) + goto out; + + if ( (space_id = H5Aget_space( attr_id )) < 0 ) + goto out; + + if ( H5Sget_simple_extent_type( space_id ) == H5S_NULL ) + type_size = 0; + + H5Sclose( space_id ); + + /* Malloc space enough for the string, plus 1 for the trailing '\0' */ + *data = (char *)malloc(type_size + 1); + + if ( type_size > 0) + { + if ( H5Aread( attr_id, attr_type, *data ) < 0 ) + goto out; + } + + /* Set the last character to \0 in case we are dealing with space + padded strings */ + (*data)[type_size] = '\0'; + } + else + { + /* is_vlstr */ + if ( H5Aread( attr_id, attr_type, data ) < 0 ) + goto out; + + type_size = strlen( *data ); + } + + if ( H5Tclose( attr_type ) < 0 ) + goto out; + + if ( H5Aclose( attr_id ) < 0 ) + return -1; + + return type_size; + +out: + H5Tclose( attr_type ); + H5Aclose( attr_id ); + if ( (is_vlstr == 0) && (*data != NULL) ) + free(*data); + *data = NULL; + return -1; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRget_attribute_vlen_string_array + * + * Purpose: Reads a variable length string attribute named attr_name. 
+ * + * Return: Success: number of elements of the array, Failure: -1 + * + * Programmer: Antonio Valentino + * + * Date: November 27, 2011 + * + * Comments: only rank 1 attributes of 8bit strings are supported + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +hsize_t H5ATTRget_attribute_vlen_string_array( hid_t obj_id, + const char *attr_name, + char ***data, + int *cset ) +{ + /* identifiers */ + hid_t attr_id = -1, attr_type = -1, space_id = -1; + hsize_t nelements = 0, *dims = NULL; + int ndims = 0, i; + + *data = NULL; + if ( ( attr_id = H5Aopen_by_name( obj_id, ".", attr_name, + H5P_DEFAULT, H5P_DEFAULT ) ) < 0 ) + return -1; + + if ( (attr_type = H5Aget_type( attr_id )) < 0 ) + goto out; + + if ( ( cset != NULL ) && ( ( *cset = H5Tget_cset( attr_type ) ) < 0 ) ) + goto out; + + if ( (space_id = H5Aget_space( attr_id )) < 0 ) + goto out; + + if ( (ndims = H5Sget_simple_extent_ndims( space_id )) < 1 ) + goto out; + + if ( (dims = (hsize_t *)malloc(ndims * sizeof(hsize_t))) == NULL ) + goto out; + + if ( H5Sget_simple_extent_dims( space_id, dims, NULL ) < 0 ) + goto out; + + nelements = 1; + for ( i = 0; i < ndims; ++i ) + nelements *= dims[i]; + + free( dims ); + dims = NULL; + + if ((*data = (char **)malloc( nelements * sizeof(char*))) == NULL ) + goto out; + + if ( H5Aread( attr_id, attr_type, *data ) < 0 ) + goto out; + + if ( H5Tclose( attr_type ) < 0 ) + goto out; + + if ( H5Sclose( space_id ) < 0 ) + goto out; + + if ( H5Aclose( attr_id ) < 0 ) + return -1; + + return nelements; + +out: + if ( *data != NULL ) + free( *data ); + *data = NULL; + if ( dims != NULL ) + free( dims ); + H5Tclose( attr_type ); + H5Sclose( space_id ); + H5Aclose( attr_id ); + return -1; +} + + +/*------------------------------------------------------------------------- + * + * Helper functions + * + *------------------------------------------------------------------------- + */ + +/*------------------------------------------------------------------------- + * Function: find_attr + * + * Purpose: operator function used by H5ATTRfind_attribute + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: June 21, 2001 + * + * Comments: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + +static herr_t find_attr( hid_t loc_id, + const char *name, + const H5A_info_t *ainfo, + void *op_data) +{ + + /* Define a default zero value for return. This will cause the + * iterator to continue if the palette attribute is not found yet. + */ + + int ret = 0; + + char *attr_name = (char*)op_data; + + /* Shut the compiler up */ + loc_id=loc_id; + + /* Define a positive value for return value if the attribute was + * found. This will cause the iterator to immediately return that + * positive value, indicating short-circuit success + */ + + if( strcmp( name, attr_name ) == 0 ) + ret = 1; + + return ret; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRfind_attribute + * + * Purpose: Inquires if an attribute named attr_name exists attached + * to the object loc_id. + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: June 21, 2001 + * + * Comments: + * The function uses H5Aiterate with the operator function find_attr + * + * Return: + * Success: The return value of the first operator that + * returns non-zero, or zero if all members were + * processed with no operator returning non-zero. 
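+ *                 In this use that means 1 when the attribute exists and 0
+ *                 when it does not, since find_attr() returns 1 only on an
+ *                 exact name match.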
+ * + * Failure: Negative if something goes wrong within the + * library, or the negative value returned by one + * of the operators. + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRfind_attribute( hid_t loc_id, + const char* attr_name ) +{ + + hsize_t attr_num; + herr_t ret; + + attr_num = 0; + ret = H5Aiterate( loc_id, H5_INDEX_CRT_ORDER, H5_ITER_NATIVE, &attr_num, + find_attr, (void *)attr_name ); + + return ret; +} + + + +/*------------------------------------------------------------------------- + * Function: H5ATTRget_attribute_ndims + * + * Purpose: Gets the dimensionality of an attribute. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: September 4, 2001 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRget_type_ndims( hid_t obj_id, + const char *attr_name, + hid_t *type_id, + H5T_class_t *class_id, + size_t *type_size, + int *rank ) +{ + hid_t attr_id; + hid_t space_id; + + /* Open the attribute. */ + if ( ( attr_id = H5Aopen_by_name(obj_id, ".", attr_name, + H5P_DEFAULT, H5P_DEFAULT) ) < 0 ) + { + return -1; + } + + /* Get an identifier for the datatype. */ + *type_id = H5Aget_type( attr_id ); + + /* Get the class. */ + *class_id = H5Tget_class( *type_id ); + + /* Get the size. */ + *type_size = H5Tget_size( *type_id ); + + /* Get the dataspace handle */ + if ( (space_id = H5Aget_space( attr_id )) < 0 ) + goto out; + + /* Get rank */ + if ( (*rank = H5Sget_simple_extent_ndims( space_id )) < 0 ) + goto out; + + /* Terminate access to the attribute */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* End access to the attribute */ + if ( H5Aclose( attr_id ) ) + goto out;; + + return 0; + +out: + H5Tclose( *type_id ); + H5Aclose( attr_id ); + return -1; +} + + +/*------------------------------------------------------------------------- + * Function: H5ATTRget_dims + * + * Purpose: Gets information about an attribute. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: September 4, 2001 + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRget_dims( hid_t obj_id, + const char *attr_name, + hsize_t *dims) +{ + hid_t attr_id; + hid_t space_id; + + /* Open the attribute. */ + if ( ( attr_id = H5Aopen_by_name(obj_id, ".", attr_name, + H5P_DEFAULT, H5P_DEFAULT) ) < 0 ) + { + return -1; + } + + /* Get the dataspace handle */ + if ( (space_id = H5Aget_space( attr_id )) < 0 ) + goto out; + + /* Get dimensions */ + if ( H5Sget_simple_extent_dims( space_id, dims, NULL) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* End access to the attribute */ + if ( H5Aclose( attr_id ) ) + goto out; + + return 0; + +out: + H5Aclose( attr_id ); + return -1; +} diff --git a/src/H5ATTR.h b/src/H5ATTR.h new file mode 100644 index 0000000..65f23c3 --- /dev/null +++ b/src/H5ATTR.h @@ -0,0 +1,86 @@ + +/**************************************************************************** + * NCSA HDF * + * Scientific Data Technologies * + * National Center for Supercomputing Applications * + * University of Illinois at Urbana-Champaign * + * 605 E. Springfield, Champaign IL 61820 * + * * + * For conditions of distribution and use, see the accompanying * + * hdf/COPYING file. * + * * + * Modified versions of H5LT for getting and setting attributes for open + * groups and leaves. + * F. 
Alted 2005/09/29 + * * + ****************************************************************************/ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +/*------------------------------------------------------------------------- + * + * Set & get attribute functions + * + *------------------------------------------------------------------------- + */ + +herr_t H5ATTRset_attribute( hid_t obj_id, + const char *attr_name, + hid_t type_id, + size_t rank, + hsize_t *dims, + const char *attr_data ); + +herr_t H5ATTRset_attribute_string( hid_t obj_id, + const char *attr_name, + const char *attr_data, + hsize_t attr_size, + int cset ); + +herr_t H5ATTRget_attribute( hid_t loc_id, + const char *attr_name, + hid_t type_id, + void *data ); + +hsize_t H5ATTRget_attribute_string( hid_t obj_id, + const char *attr_name, + char **data, + int *cset ); + +hsize_t H5ATTRget_attribute_vlen_string_array( hid_t obj_id, + const char *attr_name, + char ***data, + int *cset ); + +/*------------------------------------------------------------------------- + * + * Query attribute functions + * + *------------------------------------------------------------------------- + */ + + +herr_t H5ATTRfind_attribute( hid_t loc_id, + const char* attr_name ); + +herr_t H5ATTRget_type_ndims( hid_t loc_id, + const char *attr_name, + hid_t *type_id, + H5T_class_t *class_id, + size_t *type_size, + int *rank ); + +herr_t H5ATTRget_dims( hid_t loc_id, + const char *attr_name, + hsize_t *dims ); + + +#ifdef __cplusplus +} +#endif diff --git a/src/H5TB-opt.c b/src/H5TB-opt.c new file mode 100644 index 0000000..9dfac04 --- /dev/null +++ b/src/H5TB-opt.c @@ -0,0 +1,697 @@ +/**************************************************************************** + * NCSA HDF * + * Scientific Data Technologies * + * National Center for Supercomputing Applications * + * University of Illinois at Urbana-Champaign * + * 605 E. Springfield, Champaign IL 61820 * + * * + * For conditions of distribution and use, see the accompanying * + * hdf/COPYING file. * + * * + ****************************************************************************/ + +/* WARNING: This is a highly stripped down and modified version of the + original H5TB.c that comes with the HDF5 library. These + modifications has been done in order to serve the needs of + PyTables, and specially for supporting nested datatypes. In + particular, the VERSION attribute is out of sync so it is not + guaranteed that the resulting PyTables objects will be identical + with those generated with HDF5_HL, although they should remain + largely compatibles. + + F. Alted 2005/06/09 + + Other modifications are that these routines are meant for opened + nodes, and do not spend time opening and closing datasets. + + F. 
Alted 2005/09/29 + + */ + +#include +#include + +#include "H5TB-opt.h" +#include "tables.h" +#include "H5Zlzo.h" /* Import FILTER_LZO */ +#include "H5Zbzip2.h" /* Import FILTER_BZIP2 */ +#include "blosc_filter.h" /* Import FILTER_BLOSC */ + +/* Define this in order to shrink datasets after deleting */ +#if 1 +#define SHRINK +#endif + +/*------------------------------------------------------------------------- + * + * Create functions + * + *------------------------------------------------------------------------- + */ + +/*------------------------------------------------------------------------- + * Function: H5TBmake_table + * + * Purpose: Make a table + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * Quincey Koziol + * + * Date: January 17, 2001 + * + * Comments: The data is packed + * * Heavily modified and not compliant with attributes + * May 20, 2005 + * F. Alted + * + * Modifications: + * * Modified by A. Cobb. August 21, 2017 (track_times) + * + *------------------------------------------------------------------------- + */ + + +hid_t H5TBOmake_table( const char *table_title, + hid_t loc_id, + const char *dset_name, + char *version, + const char *class_, + hid_t type_id, + hsize_t nrecords, + hsize_t chunk_size, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data ) +{ + + hid_t dataset_id; + hid_t space_id; + hid_t plist_id; + hsize_t dims[1]; + hsize_t dims_chunk[1]; + hsize_t maxdims[1] = { H5S_UNLIMITED }; + unsigned int cd_values[7]; + int blosc_compcode; + char *blosc_compname = NULL; + + dims[0] = nrecords; + dims_chunk[0] = chunk_size; + + /* Create a simple data space with unlimited size */ + if ( (space_id = H5Screate_simple( 1, dims, maxdims )) < 0 ) + return -1; + + /* Dataset creation properties */ + plist_id = H5Pcreate (H5P_DATASET_CREATE); + + /* Enable or disable recording dataset times */ + if ( H5Pset_obj_track_times( plist_id, track_times ) < 0 ) + return -1; + + /* Modify dataset creation properties, i.e. enable chunking */ + if ( H5Pset_chunk ( plist_id, 1, dims_chunk ) < 0 ) + return -1; + + /* Set the fill value using a struct as the data type. 
*/ + if ( fill_data) + { + if ( H5Pset_fill_value( plist_id, type_id, fill_data ) < 0 ) + return -1; + } + else { + if ( H5Pset_fill_time(plist_id, H5D_FILL_TIME_ALLOC) < 0 ) + return -1; + } + + /* + Dataset creation property list is modified to use filters + */ + + /* Fletcher must be first */ + if (fletcher32) { + if ( H5Pset_fletcher32( plist_id) < 0 ) + return -1; + } + /* Then shuffle (blosc shuffles inplace) */ + if ((shuffle && compress) && (strncmp(complib, "blosc", 5) != 0)) { + if ( H5Pset_shuffle( plist_id) < 0 ) + return -1; + } + /* Finally compression */ + if ( compress ) + { + cd_values[0] = compress; + cd_values[1] = (int)(atof(version) * 10); + cd_values[2] = Table; + /* The default compressor in HDF5 (zlib) */ + if (strcmp(complib, "zlib") == 0) { + if ( H5Pset_deflate( plist_id, compress) < 0 ) + return -1; + } + /* The Blosc compressor does accept parameters */ + else if (strcmp(complib, "blosc") == 0) { + cd_values[4] = compress; + cd_values[5] = shuffle; + if ( H5Pset_filter( plist_id, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 6, cd_values) < 0 ) + return -1; + } + /* The Blosc compressor can use other compressors */ + else if (strncmp(complib, "blosc:", 6) == 0) { + cd_values[4] = compress; + cd_values[5] = shuffle; + blosc_compname = complib + 6; + blosc_compcode = blosc_compname_to_compcode(blosc_compname); + cd_values[6] = blosc_compcode; + if ( H5Pset_filter( plist_id, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 7, cd_values) < 0 ) + return -1; + } + /* The LZO compressor does accept parameters */ + else if (strcmp(complib, "lzo") == 0) { + if ( H5Pset_filter( plist_id, FILTER_LZO, H5Z_FLAG_OPTIONAL, 3, cd_values) < 0 ) + return -1; + } + /* The bzip2 compress does accept parameters */ + else if (strcmp(complib, "bzip2") == 0) { + if ( H5Pset_filter( plist_id, FILTER_BZIP2, H5Z_FLAG_OPTIONAL, 3, cd_values) < 0 ) + return -1; + } + else { + /* Compression library not supported */ + return -1; + } + + } + + /* Create the dataset. */ + if ( (dataset_id = H5Dcreate( loc_id, dset_name, type_id, space_id, + H5P_DEFAULT, plist_id, H5P_DEFAULT )) < 0 ) + goto out; + + /* Only write if there is something to write */ + if ( data ) + { + /* Write data to the dataset. */ + if ( H5Dwrite( dataset_id, type_id, H5S_ALL, H5S_ALL, H5P_DEFAULT, data ) < 0 ) + goto out; + + } + + /* Terminate access to the data space. 
*/ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* End access to the property list */ + if ( H5Pclose( plist_id ) < 0 ) + goto out; + + /* Return the object unique ID for future references */ + return dataset_id; + +/* error zone, gracefully close */ +out: + H5E_BEGIN_TRY { + H5Dclose(dataset_id); + H5Sclose(space_id); + H5Pclose(plist_id); + } H5E_END_TRY; + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5TBOread_records + * + * Purpose: Read records from an opened table + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: April 19, 2003 + * + * Comments: + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5TBOread_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t start, + hsize_t nrecords, + void *data ) +{ + + hid_t space_id; + hid_t mem_space_id; + hsize_t count[1]; + hsize_t offset[1]; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + offset[0] = start; + count[0] = nrecords; + if ( H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple( 1, count, NULL )) < 0 ) + goto out; + + if ( H5Dread(dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + return -1; + +} + +/*------------------------------------------------------------------------- + * Function: H5TBOread_elements + * + * Purpose: Read selected records from an opened table + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: April 19, 2003 + * + * Comments: + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5TBOread_elements( hid_t dataset_id, + hid_t mem_type_id, + hsize_t nrecords, + void *coords, + void *data ) +{ + + hid_t space_id; + hid_t mem_space_id; + hsize_t count[1]; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Define a selection of points in the dataset */ + + if ( H5Sselect_elements(space_id, H5S_SELECT_SET, (size_t)nrecords, (const hsize_t *)coords) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + count[0] = nrecords; + if ( (mem_space_id = H5Screate_simple( 1, count, NULL )) < 0 ) + goto out; + + if ( H5Dread( dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5TBOappend_records + * + * Purpose: Appends records to a table + * + * Return: Success: 0, Failure: -1 + * + * Programmers: + * Francesc Alted, faltet@pytables.com + * + * Date: April 20, 2003 + * + * Comments: Uses memory offsets + * + * Modifications: + * + * + 
*------------------------------------------------------------------------- + */ + +herr_t H5TBOappend_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t nrecords, + hsize_t nrecords_orig, + const void *data ) +{ + hid_t space_id = -1; /* Shut up the compiler */ + hsize_t count[1]; + hsize_t offset[1]; + hid_t mem_space_id = -1; /* Shut up the compiler */ + hsize_t dims[1]; + + + /* Extend the dataset */ + dims[0] = nrecords_orig; + dims[0] += nrecords; + if ( H5Dset_extent(dataset_id, dims) < 0 ) + goto out; + + /* Create a simple memory data space */ + count[0]=nrecords; + if ( (mem_space_id = H5Screate_simple( 1, count, NULL )) < 0 ) + return -1; + + /* Get the file data space */ + if ( (space_id = H5Dget_space(dataset_id)) < 0 ) + return -1; + + /* Define a hyperslab in the dataset */ + offset[0] = nrecords_orig; + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, offset, NULL, count, NULL) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + return -1; + +} + +/*------------------------------------------------------------------------- + * Function: H5TBOwrite_records + * + * Purpose: Writes records + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * + * Date: November 19, 2001 + * + * Comments: Uses memory offsets + * + * Modifications: + * - Added a step parameter in order to support strided writing. + * Francesc Alted, faltet@pytables.com. 2004-08-12 + * + * - Removed the type_size which was unnecessary + * Francesc Alted, 2005-10-25 + * + *------------------------------------------------------------------------- + */ + +herr_t H5TBOwrite_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t start, + hsize_t nrecords, + hsize_t step, + const void *data ) +{ + + hsize_t count[1]; + hsize_t stride[1]; + hsize_t offset[1]; + hid_t space_id; + hid_t mem_space_id; + hsize_t dims[1]; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get records */ + if ( H5Sget_simple_extent_dims( space_id, dims, NULL) < 0 ) + goto out; + +/* if ( start + nrecords > dims[0] ) */ + if ( start + (nrecords-1) * step + 1 > dims[0] ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + offset[0] = start; + stride[0] = step; + count[0] = nrecords; + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, offset, stride, count, NULL) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + if ( (mem_space_id = H5Screate_simple( 1, count, NULL )) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + return -1; + +} + +/*------------------------------------------------------------------------- + * Function: H5TBOwrite_elements + * + * Purpose: Writes records on a list of coordinates + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, + * + * Date: October 25, 2005 + * + * Comments: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5TBOwrite_elements( hid_t dataset_id, + hid_t 
mem_type_id, + hsize_t nrecords, + const void *coords, + const void *data ) +{ + + hsize_t count[1]; + hid_t space_id; + hid_t mem_space_id; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Define a selection of points in the dataset */ + + if ( H5Sselect_elements(space_id, H5S_SELECT_SET, (size_t)nrecords, (const hsize_t *)coords) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + count[0] = nrecords; + if ( (mem_space_id = H5Screate_simple( 1, count, NULL )) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, data ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + +return 0; + +out: + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5TBOdelete_records + * + * Purpose: Delete records from middle of table ("pulling up" all the records after it) + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Pedro Vicente, pvn@ncsa.uiuc.edu + * Modified by: F. Alted + * + * Date: November 26, 2001 + * + * Modifications: April 29, 2003 + * Modifications: February 19, 2004 (buffered rewriting of trailing rows) + * Modifications: September 28, 2005 (adapted to opened tables) + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5TBOdelete_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t ntotal_records, + size_t src_size, + hsize_t start, + hsize_t nrecords, + hsize_t maxtuples) +{ + + hsize_t nrowsread; + hsize_t read_start; + hsize_t write_start; + hsize_t read_nrecords; + hsize_t count[1]; + hsize_t offset[1]; + hid_t space_id; + hid_t mem_space_id; + hsize_t mem_size[1]; + unsigned char *tmp_buf; + hsize_t dims[1]; + size_t read_nbuf; + + /* Shut the compiler up */ + tmp_buf = NULL; + +/*------------------------------------------------------------------------- + * Read the records after the deleted one(s) + *------------------------------------------------------------------------- + */ + + read_start = start + nrecords; + write_start = start; + read_nrecords = ntotal_records - read_start; + /* This check added for the case that there are no records to be read */ + /* F. 
Alted 2003/07/16 */ + if (read_nrecords > 0) { + nrowsread = 0; + + while (nrowsread < read_nrecords) { + + if (nrowsread + maxtuples < read_nrecords) + read_nbuf = (size_t)maxtuples; + else + read_nbuf = (size_t)(read_nrecords - nrowsread); + + tmp_buf = (unsigned char *)malloc(read_nbuf * src_size ); + + if ( tmp_buf == NULL ) + return -1; + + /* Read the records after the deleted one(s) */ + if ( H5TBOread_records(dataset_id, mem_type_id, read_start, + read_nbuf, tmp_buf ) < 0 ) + return -1; + +/*------------------------------------------------------------------------- + * Write the records in another position + *------------------------------------------------------------------------- + */ + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Define a hyperslab in the dataset of the size of the records */ + offset[0] = write_start; + count[0] = read_nbuf; + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, offset, NULL, count, NULL) < 0 ) + goto out; + + /* Create a memory dataspace handle */ + mem_size[0] = count[0]; + if ( (mem_space_id = H5Screate_simple( 1, mem_size, NULL )) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, mem_type_id, mem_space_id, space_id, H5P_DEFAULT, tmp_buf ) < 0 ) + goto out; + + /* Terminate access to the memory dataspace */ + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + + /* Release the reading buffer */ + free( tmp_buf ); + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* Update the counters */ + read_start += read_nbuf; + write_start += read_nbuf; + nrowsread += read_nbuf; + } /* while (nrowsread < read_nrecords) */ + } /* if (nread_nrecords > 0) */ + + +/*------------------------------------------------------------------------- + * Change the table dimension + *------------------------------------------------------------------------- + */ + +#if defined (SHRINK) + dims[0] = (int)ntotal_records - (int)nrecords; + if ( H5Dset_extent( dataset_id, dims ) < 0 ) + goto out; +#endif + + return 0; + +out: + return -1; +} + + diff --git a/src/H5TB-opt.h b/src/H5TB-opt.h new file mode 100644 index 0000000..f90935b --- /dev/null +++ b/src/H5TB-opt.h @@ -0,0 +1,60 @@ +#include + +#ifdef __cplusplus +extern "C" { +#endif + +hid_t H5TBOmake_table( const char *table_title, + hid_t loc_id, + const char *dset_name, + char *version, + const char *class_, + hid_t type_id, + hsize_t nrecords, + hsize_t chunk_size, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data ); + +herr_t H5TBOread_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t start, + hsize_t nrecords, + void *data ); + +herr_t H5TBOread_elements( hid_t dataset_id, + hid_t mem_type_id, + hsize_t nrecords, + void *coords, + void *data ); + +herr_t H5TBOappend_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t nrecords, + hsize_t nrecords_orig, + const void *data ); + +herr_t H5TBOwrite_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t start, + hsize_t nrecords, + hsize_t step, + const void *data ); + +herr_t H5TBOwrite_elements( hid_t dataset_id, + hid_t mem_type_id, + hsize_t nrecords, + const void *coords, + const void *data ); + +herr_t H5TBOdelete_records( hid_t dataset_id, + hid_t mem_type_id, + hsize_t ntotal_records, + size_t src_size, + hsize_t start, + hsize_t nrecords, + hsize_t maxtuples ); diff --git a/src/H5VLARRAY.c b/src/H5VLARRAY.c new file mode 100644 index 0000000..2113d0e --- /dev/null +++ 
b/src/H5VLARRAY.c @@ -0,0 +1,414 @@ +#include "H5ATTR.h" +#include "tables.h" +#include "utils.h" /* get_order */ +#include "H5Zlzo.h" /* Import FILTER_LZO */ +#include "H5Zbzip2.h" /* Import FILTER_BZIP2 */ +#include "blosc_filter.h" /* Import FILTER_BLOSC */ +#include +#include + + +/*------------------------------------------------------------------------- + * + * Public functions + * + *------------------------------------------------------------------------- + */ + + +/*------------------------------------------------------------------------- + * Function: H5VLARRAYmake + * + * Purpose: Creates and writes a dataset of a variable length type type_id + * + * Return: Success: 0, Failure: -1 + * + * Programmer: F. Alted + * + * Date: November 08, 2003 + * + * Comments: Modified by A. Cobb. August 21, 2017 (track_times) + * + *------------------------------------------------------------------------- + */ + +hid_t H5VLARRAYmake( hid_t loc_id, + const char *dset_name, + const char *obversion, + const int rank, + const hsize_t *dims, + hid_t type_id, + hsize_t chunk_size, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data) +{ + + hvl_t vldata; + hid_t dataset_id, space_id, datatype, tid1; + hsize_t dataset_dims[1]; + hsize_t maxdims[1] = { H5S_UNLIMITED }; + hsize_t dims_chunk[1]; + hid_t plist_id; + unsigned int cd_values[7]; + int blosc_compcode; + char *blosc_compname = NULL; + + if (data) + /* if data, one row will be filled initially */ + dataset_dims[0] = 1; + else + /* no data, so no rows on dataset initally */ + dataset_dims[0] = 0; + + dims_chunk[0] = chunk_size; + + /* Fill the vldata estructure with the data to write */ + /* This is currectly not used */ + vldata.p = (void *)data; + vldata.len = 1; /* Only one array type to save */ + + /* Create a VL datatype */ + if (rank == 0) { + datatype = H5Tvlen_create(type_id); + } + else { + tid1 = H5Tarray_create(type_id, rank, dims); + datatype = H5Tvlen_create(tid1); + H5Tclose( tid1 ); /* Release resources */ + } + + /* The dataspace */ + space_id = H5Screate_simple( 1, dataset_dims, maxdims ); + + /* Dataset creation properties */ + plist_id = H5Pcreate (H5P_DATASET_CREATE); + + /* Enable or disable recording dataset times */ + if ( H5Pset_obj_track_times( plist_id, track_times ) < 0 ) + return -1; + + /* Modify dataset creation properties, i.e. 
enable chunking */ + if ( H5Pset_chunk ( plist_id, 1, dims_chunk ) < 0 ) + return -1; + + /* + Dataset creation property list is modified to use + */ + + /* Fletcher must be first */ + if (fletcher32) { + if ( H5Pset_fletcher32( plist_id) < 0 ) + return -1; + } + /* Then shuffle (blosc shuffles inplace) */ + if ((shuffle && compress) && (strncmp(complib, "blosc", 5) != 0)) { + if ( H5Pset_shuffle( plist_id) < 0 ) + return -1; + } + /* Finally compression */ + if (compress) { + cd_values[0] = compress; + cd_values[1] = (int)(atof(obversion) * 10); + cd_values[2] = VLArray; + /* The default compressor in HDF5 (zlib) */ + if (strcmp(complib, "zlib") == 0) { + if ( H5Pset_deflate( plist_id, compress) < 0 ) + return -1; + } + /* The Blosc compressor does accept parameters */ + else if (strcmp(complib, "blosc") == 0) { + cd_values[4] = compress; + cd_values[5] = shuffle; + if ( H5Pset_filter( plist_id, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 6, cd_values) < 0 ) + return -1; + } + /* The Blosc compressor can use other compressors */ + else if (strncmp(complib, "blosc:", 6) == 0) { + cd_values[4] = compress; + cd_values[5] = shuffle; + blosc_compname = complib + 6; + blosc_compcode = blosc_compname_to_compcode(blosc_compname); + cd_values[6] = blosc_compcode; + if ( H5Pset_filter( plist_id, FILTER_BLOSC, H5Z_FLAG_OPTIONAL, 7, cd_values) < 0 ) + return -1; + } + /* The LZO compressor does accept parameters */ + else if (strcmp(complib, "lzo") == 0) { + if ( H5Pset_filter( plist_id, FILTER_LZO, H5Z_FLAG_OPTIONAL, 3, cd_values) < 0 ) + return -1; + } + /* The bzip2 compress does accept parameters */ + else if (strcmp(complib, "bzip2") == 0) { + if ( H5Pset_filter( plist_id, FILTER_BZIP2, H5Z_FLAG_OPTIONAL, 3, cd_values) < 0 ) + return -1; + } + else { + /* Compression library not supported */ + fprintf(stderr, "Compression library not supported\n"); + return -1; + } + } + + /* Create the dataset. */ + if ((dataset_id = H5Dcreate(loc_id, dset_name, datatype, space_id, + H5P_DEFAULT, plist_id, H5P_DEFAULT )) < 0 ) + goto out; + + /* Write the dataset only if there is data to write */ + if (data) + if ( H5Dwrite( dataset_id, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &vldata ) < 0 ) + goto out; + + /* Terminate access to the data space. 
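The dataset created above holds its own copy of the dataspace, so the local identifier can be released once the data has been written.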
*/ + if ( H5Sclose( space_id ) < 0 ) + return -1; + + /* Release the datatype in the case that it is not an atomic type */ + if ( H5Tclose( datatype ) < 0 ) + return -1; + + /* End access to the property list */ + if ( H5Pclose( plist_id ) < 0 ) + goto out; + + return dataset_id; + +out: + + return -1; + +} + +/*------------------------------------------------------------------------- + * Function: H5ARRAYappend_records + * + * Purpose: Appends records to an array + * + * Return: Success: 0, Failure: -1 + * + * Programmers: + * Francesc Alted + * + * Date: October 30, 2003 + * + * Comments: Uses memory offsets + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + + +herr_t H5VLARRAYappend_records( hid_t dataset_id, + hid_t type_id, + int nobjects, + hsize_t nrecords, + const void *data ) +{ + + hid_t space_id; + hid_t mem_space_id; + hsize_t start[1]; + hsize_t dataset_dims[1]; + hsize_t dims_new[1] = {1}; /* Only a record on each append */ + hvl_t wdata; /* Information to write */ + + + /* Initialize VL data to write */ + wdata.p=(void *)data; + wdata.len=nobjects; + + /* Dimension for the new dataset */ + dataset_dims[0] = nrecords + 1; + + /* Extend the dataset */ + if ( H5Dset_extent( dataset_id, dataset_dims ) < 0 ) + goto out; + + /* Create a simple memory data space */ + if ( (mem_space_id = H5Screate_simple( 1, dims_new, NULL )) < 0 ) + return -1; + + /* Get the file data space */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + return -1; + + /* Define a hyperslab in the dataset */ + start[0] = nrecords; + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, start, NULL, dims_new, NULL) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, &wdata ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + +return 1; + +out: + return -1; + +} + + +/*------------------------------------------------------------------------- + * Function: H5ARRAYmodify_records + * + * Purpose: Modify records of an array + * + * Return: Success: 0, Failure: -1 + * + * Programmers: + * Francesc Alted + * + * Date: October 28, 2004 + * + * Comments: Uses memory offsets + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +herr_t H5VLARRAYmodify_records( hid_t dataset_id, + hid_t type_id, + hsize_t nrow, + int nobjects, + const void *data ) +{ + + hid_t space_id; + hid_t mem_space_id; + hsize_t start[1]; + hsize_t dims_new[1] = {1}; /* Only a record on each update */ + hvl_t wdata; /* Information to write */ + + /* Initialize VL data to write */ + wdata.p=(void *)data; + wdata.len=nobjects; + + /* Create a simple memory data space */ + if ( (mem_space_id = H5Screate_simple( 1, dims_new, NULL )) < 0 ) + return -1; + + /* Get the file data space */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + return -1; + + /* Define a hyperslab in the dataset */ + start[0] = nrow; + if ( H5Sselect_hyperslab( space_id, H5S_SELECT_SET, start, NULL, dims_new, NULL) < 0 ) + goto out; + + if ( H5Dwrite( dataset_id, type_id, mem_space_id, space_id, H5P_DEFAULT, &wdata ) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + if ( H5Sclose( mem_space_id ) < 0 ) + goto out; + +return 1; + +out: + return -1; + +} + + +/*------------------------------------------------------------------------- + * 
Function: H5VLARRAYget_info + * + * Purpose: Gathers info about the VLEN type and other. + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted + * + * Date: November 19, 2003 + * + *------------------------------------------------------------------------- + */ + +herr_t H5VLARRAYget_info( hid_t dataset_id, + hid_t type_id, + hsize_t *nrecords, + char *base_byteorder ) +{ + + hid_t space_id; + H5T_class_t base_class_id; + H5T_class_t atom_class_id; + hid_t atom_type_id; + hid_t base_type_id; + + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get number of records (it should be rank-1) */ + if ( H5Sget_simple_extent_dims( space_id, nrecords, NULL) < 0 ) + goto out; + + /* Terminate access to the dataspace */ + if ( H5Sclose( space_id ) < 0 ) + goto out; + + /* Get the type of the atomic component */ + atom_type_id = H5Tget_super( type_id ); + + /* Get the class of the atomic component. */ + atom_class_id = H5Tget_class( atom_type_id ); + + /* Check whether the atom is an array class object or not */ + if ( atom_class_id == H5T_ARRAY) { + /* Get the array base component */ + base_type_id = H5Tget_super( atom_type_id ); + /* Get the class of base component */ + base_class_id = H5Tget_class( base_type_id ); + /* Release the datatypes */ + if ( H5Tclose(atom_type_id ) ) + goto out; + } + else { + base_class_id = atom_class_id; + base_type_id = atom_type_id; + } + + /* Get the byteorder */ + /* Only integer, float and time classes can be byteordered */ + if ((base_class_id == H5T_INTEGER) || (base_class_id == H5T_FLOAT) + || (base_class_id == H5T_BITFIELD) || (base_class_id == H5T_COMPOUND) + || (base_class_id == H5T_TIME)) { + get_order(base_type_id, base_byteorder); + } + else { + strcpy(base_byteorder, "irrelevant"); + } + + /* Release the datatypes */ + if ( H5Tclose(base_type_id ) ) + goto out; + + return 0; + +out: + return -1; + +} + diff --git a/src/H5VLARRAY.h b/src/H5VLARRAY.h new file mode 100644 index 0000000..a4e35df --- /dev/null +++ b/src/H5VLARRAY.h @@ -0,0 +1,47 @@ +#ifndef _H5VLARRAY_H +#define _H5VLARRAY_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +hid_t H5VLARRAYmake( hid_t loc_id, + const char *dset_name, + const char *obversion, + const int rank, + const hsize_t *dims, + hid_t type_id, + hsize_t chunk_size, + void *fill_data, + int compress, + char *complib, + int shuffle, + int fletcher32, + hbool_t track_times, + const void *data); + +herr_t H5VLARRAYappend_records( hid_t dataset_id, + hid_t type_id, + int nobjects, + hsize_t nrecords, + const void *data ); + +herr_t H5VLARRAYmodify_records( hid_t dataset_id, + hid_t type_id, + hsize_t nrow, + int nobjects, + const void *data ); + +herr_t H5VLARRAYget_info( hid_t dataset_id, + hid_t type_id, + hsize_t *nrecords, + char *base_byteorder); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/H5Zbzip2.c b/src/H5Zbzip2.c new file mode 100644 index 0000000..5c2de2e --- /dev/null +++ b/src/H5Zbzip2.c @@ -0,0 +1,194 @@ +#include "H5Zbzip2.h" + +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_BZ2_LIB +#include "bzlib.h" +#endif /* defined HAVE_BZ2_LIB */ + +size_t bzip2_deflate(unsigned int flags, size_t cd_nelmts, + const unsigned int cd_values[], size_t nbytes, + size_t *buf_size, void **buf); + + +int register_bzip2(char **version, char **date) +{ +#ifdef HAVE_BZ2_LIB + char *libver, *versionp, *datep, *sep; + + H5Z_class_t filter_class = { + H5Z_CLASS_T_VERS, /* H5Z_class_t version */ + 
(H5Z_filter_t)(FILTER_BZIP2), /* filter_id */ + 1, 1, /* Encoding and decoding enabled */ + "bzip2", /* comment */ + NULL, /* can_apply_func */ + NULL, /* set_local_func */ + (H5Z_func_t)(bzip2_deflate) /* filter_func */ + }; + + /* Register the filter class for the bzip2 compressor. */ + H5Zregister(&filter_class); + + /* Get the library major version from the version string. */ + libver = strdup(BZ2_bzlibVersion()); + sep = strchr(libver, ','); + assert(sep != NULL); + assert(*(sep + 1) == ' '); + *sep = '\0'; + versionp = libver; + datep = sep + 2; /* after the comma and a space */ + + *version = strdup(versionp); + *date = strdup(datep); + + free(libver); + return 1; /* library is available */ + +#else + return 0; /* library is not available */ +#endif /* defined HAVE_BZ2_LIB */ + +} + + +size_t bzip2_deflate(unsigned int flags, size_t cd_nelmts, + const unsigned int cd_values[], size_t nbytes, + size_t *buf_size, void **buf) +{ +#ifdef HAVE_BZ2_LIB + char *outbuf = NULL; + size_t outbuflen, outdatalen; + int ret; + + if (flags & H5Z_FLAG_REVERSE) { + + /** Decompress data. + ** + ** This process is troublesome since the size of uncompressed data + ** is unknown, so the low-level interface must be used. + ** Data is decompressed to the output buffer (which is sized + ** for the average case); if it gets full, its size is doubled + ** and decompression continues. This avoids repeatedly trying to + ** decompress the whole block, which could be really inefficient. + **/ + + bz_stream stream; + char *newbuf = NULL; + size_t newbuflen; + + /* Prepare the output buffer. */ + outbuflen = nbytes * 3 + 1; /* average bzip2 compression ratio is 3:1 */ + outbuf = malloc(outbuflen); + if (outbuf == NULL) { + fprintf(stderr, "memory allocation failed for bzip2 decompression\n"); + goto cleanupAndFail; + } + + /* Use standard malloc()/free() for internal memory handling. */ + stream.bzalloc = NULL; + stream.bzfree = NULL; + stream.opaque = NULL; + + /* Start decompression. */ + ret = BZ2_bzDecompressInit(&stream, 0, 0); + if (ret != BZ_OK) { + fprintf(stderr, "bzip2 decompression start failed with error %d\n", ret); + goto cleanupAndFail; + } + + /* Feed data to the decompression process and get decompressed data. */ + stream.next_out = outbuf; + stream.avail_out = outbuflen; + stream.next_in = *buf; + stream.avail_in = nbytes; + do { + ret = BZ2_bzDecompress(&stream); + if (ret < 0) { + fprintf(stderr, "BUG: bzip2 decompression failed with error %d\n", ret); + goto cleanupAndFail; + } + + if (ret != BZ_STREAM_END && stream.avail_out == 0) { + /* Grow the output buffer. */ + newbuflen = outbuflen * 2; + newbuf = realloc(outbuf, newbuflen); + if (newbuf == NULL) { + fprintf(stderr, "memory allocation failed for bzip2 decompression\n"); + goto cleanupAndFail; + } + stream.next_out = newbuf + outbuflen; /* half the new buffer behind */ + stream.avail_out = outbuflen; /* half the new buffer ahead */ + outbuf = newbuf; + outbuflen = newbuflen; + } + } while (ret != BZ_STREAM_END); + + /* End compression. */ + outdatalen = stream.total_out_lo32; + ret = BZ2_bzDecompressEnd(&stream); + if (ret != BZ_OK) { + fprintf(stderr, "bzip2 compression end failed with error %d\n", ret); + goto cleanupAndFail; + } + + } else { + + /** Compress data. + ** + ** This is quite simple, since the size of compressed data in the worst + ** case is known and it is not much bigger than the size of uncompressed + ** data. This allows us to use the simplified one-shot interface to + ** compression. 
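+     ** In practice this is the single BZ2_bzBuffToBuffCompress() call below, with
+     ** the output buffer sized to the worst case quoted in the bzip2 documentation.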
+ **/ + + unsigned int odatalen; /* maybe not the same size as outdatalen */ + int blockSize100k = 9; + + /* Get compression block size if present. */ + if (cd_nelmts > 0) { + blockSize100k = cd_values[0]; + if (blockSize100k < 1 || blockSize100k > 9) { + fprintf(stderr, "invalid compression block size: %d\n", blockSize100k); + goto cleanupAndFail; + } + } + + /* Prepare the output buffer. */ + outbuflen = nbytes + nbytes / 100 + 600; /* worst case (bzip2 docs) */ + outbuf = malloc(outbuflen); + if (outbuf == NULL) { + fprintf(stderr, "memory allocation failed for bzip2 compression\n"); + goto cleanupAndFail; + } + + /* Compress data. */ + odatalen = outbuflen; + ret = BZ2_bzBuffToBuffCompress(outbuf, &odatalen, *buf, nbytes, + blockSize100k, 0, 0); + outdatalen = odatalen; + if (ret != BZ_OK) { + fprintf(stderr, "bzip2 compression failed with error %d\n", ret); + goto cleanupAndFail; + } + } + + /* Always replace the input buffer with the output buffer. */ + free(*buf); + *buf = outbuf; + *buf_size = outbuflen; + return outdatalen; + + cleanupAndFail: + if (outbuf) + free(outbuf); + return 0; +#else + return 0; +#endif /* defined HAVE_BZ2_LIB */ +} diff --git a/src/H5Zbzip2.h b/src/H5Zbzip2.h new file mode 100644 index 0000000..73ebc97 --- /dev/null +++ b/src/H5Zbzip2.h @@ -0,0 +1,7 @@ +#ifndef __H5ZBZIP2_H__ +#define __H5ZBZIP2_H__ 1 + +#define FILTER_BZIP2 307 +int register_bzip2(char **version, char **date); + +#endif /* ! defined __H5ZBZIP2_H__ */ diff --git a/src/H5Zlzo.c b/src/H5Zlzo.c new file mode 100644 index 0000000..11d168d --- /dev/null +++ b/src/H5Zlzo.c @@ -0,0 +1,305 @@ +#include +#include +#include + +#include "H5Zlzo.h" +#include "tables.h" + +#ifdef HAVE_LZO_LIB +# include "lzo1x.h" +#endif +#ifdef HAVE_LZO2_LIB +# include "lzo/lzo1x.h" +# define HAVE_LZO_LIB /* The API for LZO and LZO2 is mostly identical */ +#endif + +/* #undef DEBUG */ + +/* Activate the checksum. It is safer and takes only a 1% more of + space and a 2% more of CPU (but sometimes is faster than without + checksum, which is almost negligible. F. Alted 2003/07/22 + + Added code for pytables 0.5 backward compatibility. + F. Alted 2003/07/28 + + Added code for saving the uncompressed length buffer as well. + F. Alted 2003/07/29 + +*/ + +/* From pytables 0.8 on I decided to let the user select the + fletcher32 checksum provided in HDF5 1.6 or higher. So, even though + the CHECKSUM support here seems pretty stable it will be disabled. + F. 
Alted 2004/01/02 */ +#undef CHECKSUM + +size_t lzo_deflate (unsigned flags, size_t cd_nelmts, + const unsigned cd_values[], size_t nbytes, + size_t *buf_size, void **buf); + + +int register_lzo(char **version, char **date) { + +#ifdef HAVE_LZO_LIB + + H5Z_class_t filter_class = { + H5Z_CLASS_T_VERS, /* H5Z_class_t version */ + (H5Z_filter_t)(FILTER_LZO), /* filter_id */ + 1, 1, /* Encoding and decoding enabled */ + "lzo", /* comment */ + NULL, /* can_apply_func */ + NULL, /* set_local_func */ + (H5Z_func_t)(lzo_deflate) /* filter_func */ + }; + + /* Init the LZO library */ + if (lzo_init()!=LZO_E_OK) { + fprintf(stderr, "Problems initializing LZO library\n"); + *version = NULL; + *date = NULL; + return 0; /* lib is not available */ + } + + /* Register the lzo compressor */ + H5Zregister(&filter_class); + + *version = strdup(LZO_VERSION_STRING); + *date = strdup(LZO_VERSION_DATE); + return 1; /* lib is available */ + +#else + *version = NULL; + *date = NULL; + return 0; /* lib is not available */ +#endif /* HAVE_LZO_LIB */ + +} + + +size_t lzo_deflate (unsigned flags, size_t cd_nelmts, + const unsigned cd_values[], size_t nbytes, + size_t *buf_size, void **buf) +{ + size_t ret_value = 0; +#ifdef HAVE_LZO_LIB + void *outbuf = NULL, *wrkmem = NULL; + int status; + size_t nalloc = *buf_size; + lzo_uint out_len = (lzo_uint) nalloc; + /* max_len_buffer will keep the likely output buffer size + after processing the first chunk */ + static unsigned int max_len_buffer = 0; + /* int complevel = 1; */ +#if (defined CHECKSUM || defined DEBUG) + int object_version = 10; /* Default version 1.0 */ + int object_type = Table; /* Default object type */ +#endif +#ifdef CHECKSUM + lzo_uint32 checksum; +#endif + + /* Check arguments */ + /* For Table versions < 20, there were no parameters */ + if (cd_nelmts==1 ) { + /* complevel = cd_values[0]; */ /* This do nothing right now */ + } + else if (cd_nelmts==2 ) { + /* complevel = cd_values[0]; */ /* This do nothing right now */ +#if (defined CHECKSUM || defined DEBUG) + object_version = cd_values[1]; /* The table VERSION attribute */ +#endif + } + else if (cd_nelmts==3 ) { + /* complevel = cd_values[0]; */ /* This do nothing right now */ +#if (defined CHECKSUM || defined DEBUG) + object_version = cd_values[1]; /* The table VERSION attribute */ + object_type = cd_values[2]; /* A tag for identifying the object + (see tables.h) */ +#endif + } + +#ifdef DEBUG + printf("Object type: %d. ", object_type); + printf("object_version:%d\n", object_version); +#endif + + if (flags & H5Z_FLAG_REVERSE) { + /* Input */ + +/* printf("Decompressing chunk with LZO\n"); */ +#ifdef CHECKSUM + if ((object_type == Table && object_version >= 20) || + object_type != Table) { + nbytes -= 4; /* Point to uncompressed buffer length */ + memcpy(&nalloc, ((unsigned char *)(*buf)+nbytes), 4); + out_len = nalloc; + nbytes -= 4; /* Point to the checksum */ +#ifdef DEBUG + printf("Compressed bytes: %d. 
Uncompressed bytes: %d\n", nbytes, nalloc); +#endif + } +#endif + + /* Only allocate the bytes for the outbuf */ + if (max_len_buffer == 0) { + if (NULL==(outbuf = (void *)malloc(nalloc))) + fprintf(stderr, "Memory allocation failed for lzo uncompression.\n"); + } + else { + if (NULL==(outbuf = (void *)malloc(max_len_buffer))) + fprintf(stderr, "Memory allocation failed for lzo uncompression.\n"); + out_len = max_len_buffer; + nalloc = max_len_buffer; + } + + while(1) { + +#ifdef DEBUG + printf("nbytes -->%d\n", nbytes); + printf("nalloc -->%d\n", nalloc); + printf("max_len_buffer -->%d\n", max_len_buffer); +#endif /* DEBUG */ + + /* The assembler version is a 10% slower than the C version with + gcc 3.2.2 and gcc 3.3.3 */ +/* status = lzo1x_decompress_asm_safe(*buf, (lzo_uint)nbytes, outbuf, */ +/* &out_len, NULL); */ + /* The safe and unsafe versions have the same speed more or less */ + status = lzo1x_decompress_safe(*buf, (lzo_uint)nbytes, outbuf, + &out_len, NULL); + + if (status == LZO_E_OK) { +#ifdef DEBUG + printf("decompressed %lu bytes back into %lu bytes\n", + (long) nbytes, (long) out_len); +#endif + max_len_buffer = out_len; + break; /* done */ + } + else if (status == LZO_E_OUTPUT_OVERRUN) { + nalloc *= 2; + out_len = (lzo_uint) nalloc; + if (NULL==(outbuf = realloc(outbuf, nalloc))) { + fprintf(stderr, "Memory allocation failed for lzo uncompression\n"); + } + } + else { + /* this should NEVER happen */ + fprintf(stderr, "internal error - decompression failed: %d\n", status); + ret_value = 0; /* fail */ + goto done; + } + } + +#ifdef CHECKSUM + if ((object_type == Table && object_version >= 20) || + object_type != Table) { +#ifdef DEBUG + printf("Checksum uncompressing..."); +#endif + /* Compute the checksum */ + checksum=lzo_adler32(lzo_adler32(0,NULL,0), outbuf, out_len); + + /* Compare */ + if (memcmp(&checksum, (unsigned char*)(*buf)+nbytes, 4)) { + ret_value = 0; /*fail*/ + fprintf(stderr,"Checksum failed!.\n"); + goto done; + } + } +#endif /* CHECKSUM */ + + free(*buf); + *buf = outbuf; + outbuf = NULL; + *buf_size = nalloc; + ret_value = out_len; + + } else { + /* + * Output; compress but fail if the result would be larger than the + * input. The library doesn't provide in-place compression, so we + * must allocate a separate buffer for the result. + */ + lzo_byte *z_src = (lzo_byte*)(*buf); + lzo_byte *z_dst; /*destination buffer */ + lzo_uint z_src_nbytes = (lzo_uint)(nbytes); + /* The next was the original computation for worst-case expansion */ + /* I don't know why the difference with LZO1*. Perhaps some wrong docs in + LZO package? */ +/* lzo_uint z_dst_nbytes = (lzo_uint)(nbytes + (nbytes / 64) + 16 + 3); */ + /* The next is for LZO1* algorithms */ +/* lzo_uint z_dst_nbytes = (lzo_uint)(nbytes + (nbytes / 16) + 64 + 3); */ + /* The next is for LZO2* algorithms. 
This will be the default */ + lzo_uint z_dst_nbytes = (lzo_uint)(nbytes + (nbytes / 8) + 128 + 3); + +#ifdef CHECKSUM + if ((object_type == Table && object_version >= 20) || + object_type != Table) { + z_dst_nbytes += 4+4; /* Checksum + buffer size */ + } +#endif + + if (NULL==(z_dst=outbuf=(void *)malloc(z_dst_nbytes))) { + fprintf(stderr, "Unable to allocate lzo destination buffer.\n"); + ret_value = 0; /* fail */ + goto done; + } + + /* Compress this buffer */ + wrkmem = malloc(LZO1X_1_MEM_COMPRESS); + if (wrkmem == NULL) { + fprintf(stderr, "Memory allocation failed for lzo compression\n"); + ret_value = 0; + goto done; + } + + status = lzo1x_1_compress (z_src, z_src_nbytes, z_dst, &z_dst_nbytes, + wrkmem); + + free(wrkmem); + wrkmem = NULL; + +#ifdef CHECKSUM + if ((object_type == Table && object_version >= 20) || + object_type != Table) { +#ifdef DEBUG + printf("Checksum compressing ..."); + printf("src_nbytes: %d, dst_nbytes: %d\n", z_src_nbytes, z_dst_nbytes); +#endif + /* Append checksum of *uncompressed* data at the end */ + checksum = lzo_adler32(lzo_adler32(0,NULL,0), *buf, nbytes); + memcpy((unsigned char*)(z_dst)+z_dst_nbytes, &checksum, 4); + memcpy((unsigned char*)(z_dst)+z_dst_nbytes+4, &nbytes, 4); + z_dst_nbytes += (lzo_uint)4+4; + nbytes += 4+4; + } +#endif + + if (z_dst_nbytes >= nbytes) { +#ifdef DEBUG + printf("The compressed buffer takes more space than uncompressed!.\n"); +#endif + ret_value = 0; /* fail */ + goto done; + } else if (LZO_E_OK != status) { + fprintf(stderr,"lzo library error in compression\n"); + ret_value = 0; /* fail */ + goto done; + } else { + free(*buf); + *buf = outbuf; + outbuf = NULL; + *buf_size = z_dst_nbytes; + ret_value = z_dst_nbytes; + } + } + +done: + if(outbuf) + free(outbuf); + +#endif /* HAVE_LZO_LIB */ + + return ret_value; +} diff --git a/src/H5Zlzo.h b/src/H5Zlzo.h new file mode 100644 index 0000000..e536cf3 --- /dev/null +++ b/src/H5Zlzo.h @@ -0,0 +1,7 @@ +#ifndef __H5ZLZO_H__ +#define __H5ZLZO_H__ 1 + +#define FILTER_LZO 305 +int register_lzo(char **version, char **date); + +#endif /* ! 
defined __H5ZLZO_H__ */ diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..1c21ab3 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,16 @@ +VERSION = $(shell cat ../VERSION) + +# All the generated files +GENERATED = version.h + +.PHONY: all clean distclean + +all: $(GENERATED) + +clean: + rm -f $(GENERATED) + +distclean: clean + +version.h: version.h.in ../VERSION + cat "$<" | sed -e 's/@VERSION@/$(VERSION)/g' > "$@" diff --git a/src/idx-opt.c b/src/idx-opt.c new file mode 100644 index 0000000..a3f3138 --- /dev/null +++ b/src/idx-opt.c @@ -0,0 +1,393 @@ +#include + +/* See https://numpy.org/doc/1.17/reference/c-api.array.html#c.NO_IMPORT_ARRAY */ +#define NO_IMPORT_ARRAY +#include "idx-opt.h" + +/*------------------------------------------------------------------------- + * + * Binary search functions + * + *------------------------------------------------------------------------- + */ + + +/*------------------------------------------------------------------------- + * Function: bisect_{left,right}_optim_* + * + * Purpose: Look-up for a value in sorted arrays + * + * Return: The index of the value in array + * + * Programmer: Francesc Alted + * + * Date: August, 2005 + * + * Comments: + * + * Modifications: + * + *------------------------------------------------------------------------- + */ + + +/* Optimised version for left/int8 */ +int bisect_left_b(npy_int8 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for left/uint8 */ +int bisect_left_ub(npy_uint8 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/int8 */ +int bisect_right_b(npy_int8 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for right/uint8 */ +int bisect_right_ub(npy_uint8 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/int16 */ +int bisect_left_s(npy_int16 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for left/uint16 */ +int bisect_left_us(npy_uint16 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/int16 */ +int bisect_right_s(npy_int16 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; 
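+    /* loop invariant: a[i+offset] <= x for every i < lo, and x < a[i+offset] for every i >= hi */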
+ } + return lo; +} + +/* Optimised version for right/uint16 */ +int bisect_right_us(npy_uint16 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/int32 */ +int bisect_left_i(npy_int32 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for left/uint32 */ +int bisect_left_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/int32 */ +int bisect_right_i(npy_int32 *a, long x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for right/uint32 */ +int bisect_right_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/int64 */ +int bisect_left_ll(npy_int64 *a, npy_int64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for left/uint64 */ +int bisect_left_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/int64 */ +int bisect_right_ll(npy_int64 *a, npy_int64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for right/uint64 */ +int bisect_right_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/float16 */ +int bisect_left_e(npy_float16 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/float16 */ +int bisect_right_e(npy_float16 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < 
a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/float32 */ +int bisect_left_f(npy_float32 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/float32 */ +int bisect_right_f(npy_float32 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/float64 */ +int bisect_left_d(npy_float64 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/float64 */ +int bisect_right_d(npy_float64 *a, npy_float64 x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + +/* Optimised version for left/longdouble */ +int bisect_left_g(npy_longdouble *a, npy_longdouble x, int hi, int offset) { + int lo = 0; + int mid; + + if (x <= a[offset]) return 0; + if (a[hi-1+offset] < x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (a[mid+offset] < x) lo = mid+1; + else hi = mid; + } + return lo; +} + +/* Optimised version for right/longdouble */ +int bisect_right_g(npy_longdouble *a, npy_longdouble x, int hi, int offset) { + int lo = 0; + int mid; + + if (x < a[offset]) return 0; + if (a[hi-1+offset] <= x) return hi; + while (lo < hi) { + mid = lo + (hi-lo)/2; + if (x < a[mid+offset]) hi = mid; + else lo = mid+1; + } + return lo; +} + diff --git a/src/idx-opt.h b/src/idx-opt.h new file mode 100644 index 0000000..ed9b723 --- /dev/null +++ b/src/idx-opt.h @@ -0,0 +1,46 @@ +#include +#include + +#ifndef NPY_FLOAT16 +typedef npy_uint16 npy_float16; +#endif + +#ifndef NPY_FLOAT96 +typedef long double npy_float96; +#endif + +#ifndef NPY_FLOAT128 +typedef long double npy_float128; +#endif + +int bisect_left_b(npy_int8 *a, long x, int hi, int offset); +int bisect_left_ub(npy_uint8 *a, long x, int hi, int offset); +int bisect_right_b(npy_int8 *a, long x, int hi, int offset); +int bisect_right_ub(npy_uint8 *a, long x, int hi, int offset); + +int bisect_left_s(npy_int16 *a, long x, int hi, int offset); +int bisect_left_us(npy_uint16 *a, long x, int hi, int offset); +int bisect_right_s(npy_int16 *a, long x, int hi, int offset); +int bisect_right_us(npy_uint16 *a, long x, int hi, int offset); + +int bisect_left_i(npy_int32 *a, long x, int hi, int offset); +int bisect_left_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset); +int bisect_right_i(npy_int32 *a, long x, int hi, int offset); +int bisect_right_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset); + +int bisect_left_ll(npy_int64 *a, npy_int64 x, int hi, int offset); +int bisect_left_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset); +int bisect_right_ll(npy_int64 *a, npy_int64 x, int hi, int offset); +int bisect_right_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset); + +int bisect_left_e(npy_float16 *a, npy_float64 x, int hi, int offset); 
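+
+/* Note: for every element type in this header, the bisect_left_* variants return
+   the first index i in [0, hi) with a[i+offset] >= x, while the bisect_right_*
+   variants return the first index with a[i+offset] > x (the same convention as
+   Python's bisect_left/bisect_right). */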
+int bisect_right_e(npy_float16 *a, npy_float64 x, int hi, int offset); + +int bisect_left_f(npy_float32 *a, npy_float64 x, int hi, int offset); +int bisect_right_f(npy_float32 *a, npy_float64 x, int hi, int offset); + +int bisect_left_d(npy_float64 *a, npy_float64 x, int hi, int offset); +int bisect_right_d(npy_float64 *a, npy_float64 x, int hi, int offset); + +int bisect_left_g(npy_longdouble *a, npy_longdouble x, int hi, int offset); +int bisect_right_g(npy_longdouble *a, npy_longdouble x, int hi, int offset); diff --git a/src/tables.h b/src/tables.h new file mode 100644 index 0000000..20d1850 --- /dev/null +++ b/src/tables.h @@ -0,0 +1,8 @@ +typedef enum +{ + Table, + Array, + EArray, + VLArray, + CArray +} TablesType; diff --git a/src/typeconv.c b/src/typeconv.c new file mode 100644 index 0000000..935d9b2 --- /dev/null +++ b/src/typeconv.c @@ -0,0 +1,94 @@ +/*********************************************************************** + * + * License: BSD + * Created: December 21, 2004 + * Author: Ivan Vilata i Balaguer - reverse:net.selidor@ivan + * Modified: + * Function inlining and some castings for 64-bit adressing + * Francesc Alted 2004-12-27 + * + * $Source: /cvsroot/pytables/pytables/src/typeconv.c,v $ + * $Id$ + * + ***********************************************************************/ + +/* Type conversion functions for PyTables types which are stored + * with a different representation between numpy and HDF5. + */ + +#include "typeconv.h" +#include +#include + + +#if (!defined _ISOC99_SOURCE && !defined __USE_ISOC99) +long int lround(double x) +{ + double trunx; + + if (x > 0.0) { + trunx = floor(x); + if (x - trunx >= 0.5) + trunx += 1; + } else { + trunx = ceil(x); + if (trunx - x >= 0.5) + trunx -= 1; + } + + return (long int)(trunx); +} +#endif /* !_ISOC99_SOURCE && !__USE_ISOC99 */ + + +void conv_float64_timeval32(void *base, + unsigned long byteoffset, + unsigned long bytestride, + PY_LONG_LONG nrecords, + unsigned long nelements, + int sense) +{ + PY_LONG_LONG record; + unsigned long element, gapsize; + double *fieldbase; + union { + PY_LONG_LONG i64; + double f64; + } tv; + + assert(bytestride > 0); + assert(nelements > 0); + + /* Byte distance from end of field to beginning of next field. */ + gapsize = bytestride - nelements * sizeof(double); + + fieldbase = (double *)((unsigned char *)(base) + byteoffset); + + for (record = 0; record < nrecords; record++) { + for (element = 0; element < nelements; element++) { + /* Perform an explicit copy of data to avoid errors related to + unaligned memory access on platforms like AMR, etc. + Patch submitted by Julian Taylor */ + double fb; + memcpy(&fb, fieldbase, sizeof(*fieldbase)); + if (sense == 0) { + /* Convert from float64 to timeval32. */ + tv.i64 = (((PY_LONG_LONG)(fb) << 32) + | (lround((fb - (int)(fb)) * 1e+6) & 0x0ffffffff)); + fb = tv.f64; + } else { + /* Convert from timeval32 to float64. 
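The packed 64-bit value stores the integer seconds in its upper 32 bits and the microseconds in its lower 32 bits.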
*/ + tv.f64 = fb; + /* the next computation is 64 bit-platforms aware */ + fb = 1e-6 * (int)tv.i64 + (tv.i64 >> 32); + } + memcpy(fieldbase, &fb, sizeof(*fieldbase)); + fieldbase++; + } + + fieldbase = (double *)((unsigned char *)(fieldbase) + gapsize); + } + + assert(fieldbase == (base + byteoffset + bytestride * nrecords)); +} + diff --git a/src/typeconv.h b/src/typeconv.h new file mode 100644 index 0000000..4f459f4 --- /dev/null +++ b/src/typeconv.h @@ -0,0 +1,37 @@ +/*********************************************************************** + * + * License: BSD + * Created: December 21, 2004 + * Author: Ivan Vilata i Balaguer - reverse:net.selidor@ivan + * + * $Source: /home/ivan/_/programari/pytables/svn/cvs/pytables/pytables/src/typeconv.h,v $ + * $Id$ + * + ***********************************************************************/ + +/* Type conversion functions for PyTables types which are stored + * with a different representation between numpy and HDF5. + */ + +#ifndef __TYPECONV_H__ +#define __TYPECONV_H__ 1 + +#include + +/* Meaning for common arguments: + * * base: pointer to data + * * byteoffset: offset of first field/element into the data + * * bytestride: distance in bytes from a field/record to the next one + * * nrecords: number of fields/records to translate + * * nelements: number of elements in a field/record + * * sense: 0 for numpy -> HDF5, otherwise HDF5 -> numpy + */ + +void conv_float64_timeval32(void *base, + unsigned long byteoffset, + unsigned long bytestride, + PY_LONG_LONG nrecords, + unsigned long nelements, + int sense); + +#endif /* def __TYPECONV_H__ */ diff --git a/src/utils.c b/src/utils.c new file mode 100644 index 0000000..7278abb --- /dev/null +++ b/src/utils.c @@ -0,0 +1,1011 @@ +#include +#include "utils.h" +#include "H5Zlzo.h" /* Import FILTER_LZO */ +#include "H5Zbzip2.h" /* Import FILTER_BZIP2 */ + +#define PyString_FromString PyUnicode_FromString + +/* See https://numpy.org/doc/1.17/reference/c-api.array.html#c.NO_IMPORT_ARRAY */ +#define NO_IMPORT_ARRAY +#include + +#ifndef NPY_COMPLEX192 +typedef npy_cdouble npy_complex192; +#endif + +#ifndef NPY_COMPLEX256 +typedef npy_cdouble npy_complex256; +#endif + +/* ---------------------------------------------------------------- */ + +#ifdef WIN32 +#include + +/* This routine is meant to detect whether a dynamic library can be + loaded on Windows. This is only way to detect its presence without + harming the user. +*/ +int getLibrary(char *libname) { + HINSTANCE hinstLib; + + /* Load the dynamic library */ + hinstLib = LoadLibrary(TEXT(libname)); + + if (hinstLib != NULL) { + /* Free the dynamic library */ + FreeLibrary(hinstLib); + return 0; + } + else { + return -1; + } +} + +#else /* Unix platforms */ +#include + +/* Routine to detect the existance of shared libraries in UNIX. This + has to be checked in MacOSX. However, this is not used right now in + utilsExtension.pyx because UNIX does not complain when trying to + load an extension library that depends on a shared library that it + is not in the system (python raises just the ImportError). 
*/ +int getLibrary(char *libname) { + void *hinstLib; + + /* Load the dynamic library */ + hinstLib = dlopen(libname, RTLD_LAZY); + + if (hinstLib != NULL) { + /* Free the dynamic library */ + dlclose(hinstLib); + return 0; + } + else { + return -1; + } +} + + +#endif /* Win32 */ + +herr_t set_cache_size(hid_t file_id, size_t cache_size) { +#if H5_VERS_MAJOR == 1 && H5_VERS_MINOR >= 7 + /* MSVS2005 chokes on declarations after statements */ + H5AC_cache_config_t config; +#endif /* if H5_VERSION < "1.7" */ + herr_t code; + + code = 0; + +#if H5_VERS_MAJOR == 1 && H5_VERS_MINOR >= 7 + config.version = H5AC__CURR_CACHE_CONFIG_VERSION; + code = H5Fget_mdc_config(file_id, &config); + config.set_initial_size = TRUE; + config.initial_size = cache_size; +/* config.incr_mode = H5C_incr__off; */ +/* config.decr_mode = H5C_decr__off; */ +/* printf("Setting cache size to: %d\n", cache_size); */ + code = H5Fset_mdc_config(file_id, &config); +/* printf("Return code for H5Fset_mdc_config: %d\n", code); */ + +#endif /* if H5_VERSION < "1.7" */ + + return code; + +} + +PyObject *getHDF5VersionInfo(void) { + long binver; + unsigned majnum, minnum, relnum; + char strver[16]; + PyObject *t; + +/* H5get_libversion(&majnum, &minnum, &relnum); */ + majnum = H5_VERS_MAJOR; + minnum = H5_VERS_MINOR; + relnum = H5_VERS_RELEASE; + /* Get a binary number */ + binver = majnum << 16 | minnum << 8 | relnum; + /* A string number */ + if (strcmp(H5_VERS_SUBRELEASE, "")) { + snprintf(strver, 16, "%d.%d.%d-%s", majnum, minnum, relnum, + H5_VERS_SUBRELEASE); + } + else { + snprintf(strver, 16, "%d.%d.%d", majnum, minnum, relnum); + } + + t = PyTuple_New(2); + PyTuple_SetItem(t, 0, PyLong_FromLong(binver)); + PyTuple_SetItem(t, 1, PyString_FromString(strver)); + return t; +} + +/**************************************************************** +** +** createNamesTuple(): Create Python tuple from a string of *char. 
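+** Each name becomes a PyUnicode object; PyTuple_SetItem steals that
+** reference, so no separate Py_DECREF is needed.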
+** +****************************************************************/ +PyObject *createNamesTuple(char *buffer[], int nelements) +{ + int i; + PyObject *t; + PyObject *str; + + t = PyTuple_New(nelements); + for (i = 0; i < nelements; i++) { + str = PyString_FromString(buffer[i]); + PyTuple_SetItem(t, i, str); + /* PyTuple_SetItem does not need a decref, because it already do this */ +/* Py_DECREF(str); */ + } + return t; +} + +PyObject *createNamesList(char *buffer[], int nelements) +{ + int i; + PyObject *t; + PyObject *str; + + t = PyList_New(nelements); + for (i = 0; i < nelements; i++) { + str = PyString_FromString(buffer[i]); + PyList_SetItem(t, i, str); + /* PyList_SetItem does not need a decref, because it already do this */ +/* Py_DECREF(str); */ + } + return t; +} + +/*------------------------------------------------------------------------- + * Function: get_filter_names + * + * Purpose: Get the filter names for the chunks in a dataset + * + * Return: Success: 0, Failure: -1 + * + * Programmer: Francesc Alted, faltet@pytables.com + * + * Date: December 19, 2003 + * + * Comments: + * + * Modifications: + * + * + *------------------------------------------------------------------------- + */ + +PyObject *get_filter_names( hid_t loc_id, + const char *dset_name) +{ + hid_t dset; + hid_t dcpl; /* dataset creation property list */ + /* hsize_t chsize[64]; /\* chunk size in elements *\/ */ + int i, j; + int nf; /* number of filters */ + unsigned filt_flags; /* filter flags */ + size_t cd_nelmts; /* filter client number of values */ + unsigned cd_values[20]; /* filter client data values */ + char f_name[256]; /* filter name */ + PyObject *filters; + PyObject *filter_values; + + /* Open the dataset. */ + if ( (dset = H5Dopen( loc_id, dset_name, H5P_DEFAULT )) < 0 ) { + goto out; + } + + /* Get the properties container */ + dcpl = H5Dget_create_plist(dset); + /* Collect information about filters on chunked storage */ + if (H5D_CHUNKED==H5Pget_layout(dcpl)) { + filters = PyDict_New(); + if ((nf = H5Pget_nfilters(dcpl))>0) { + for (i=0; itype) { + case H5L_TYPE_SOFT: + case H5L_TYPE_EXTERNAL: + PyList_Append(out_info[2], strname); + break; + case H5L_TYPE_ERROR: /* XXX: check */ + PyList_Append(out_info[3], strname); + break; + case H5L_TYPE_HARD: + /* Get type of the object and check it */ + ret = H5Gget_objinfo(loc_id, name, FALSE, &oinfo); + if (ret < 0) + return -1; + + switch(oinfo.type) { + case H5G_GROUP: + PyList_Append(out_info[0], strname); + break; + case H5G_DATASET: + PyList_Append(out_info[1], strname); + break; + case H5G_TYPE: + ++namedtypes; + break; + case H5G_UNKNOWN: + PyList_Append(out_info[3], strname); + break; + case H5G_LINK: + /* should not happen */ + PyList_Append(out_info[2], strname); + break; + default: + /* should not happen: assume it is an external link */ + PyList_Append(out_info[2], strname); + } + + /* H5Oget_info_by_name seems to have performance issues (see gh-402) + ret = H5Oget_info_by_name(loc_id, name, &oinfo, H5P_DEFAULT); + if (ret < 0) + return -1; + + switch(oinfo.type) { + case H5O_TYPE_GROUP: + PyList_Append(out_info[0], strname); + break; + case H5O_TYPE_DATASET: + PyList_Append(out_info[1], strname); + break; + case H5O_TYPE_NAMED_DATATYPE: + ++namedtypes; + break; + case H5O_TYPE_UNKNOWN: + PyList_Append(out_info[3], strname); + break; + default: + / * should not happen * / + PyList_Append(out_info[3], strname); + } + */ + break; + default: + /* should not happen */ + PyList_Append(out_info[3], strname); + } + Py_DECREF(strname); + + 
return 0 ; /* Loop until no more objects remain in directory */ +} + +/**************************************************************** +** +** Giterate(): Group iteration routine. +** +****************************************************************/ +PyObject *Giterate(hid_t parent_id, hid_t loc_id, const char *name) { + hsize_t i=0; + PyObject *t, *tgroup, *tleave, *tlink, *tunknown; + PyObject *info[4]; + + info[0] = tgroup = PyList_New(0); + info[1] = tleave = PyList_New(0); + info[2] = tlink = PyList_New(0); + info[3] = tunknown = PyList_New(0); + + /* Iterate over all the childs behind loc_id (parent_id+loc_id). + * NOTE: using H5_INDEX_CRT_ORDER instead of H5_INDEX_NAME causes failures + * in the test suite */ + H5Literate_by_name(parent_id, name, H5_INDEX_NAME, H5_ITER_NATIVE, + &i, litercb, info, H5P_DEFAULT); + + /* Create the tuple with the list of Groups and Datasets */ + t = PyTuple_New(4); + PyTuple_SetItem(t, 0, tgroup); + PyTuple_SetItem(t, 1, tleave); + PyTuple_SetItem(t, 2, tlink); + PyTuple_SetItem(t, 3, tunknown); + + return t; +} + +/**************************************************************** +** +** aitercb(): Custom attribute iteration callback routine. +** +****************************************************************/ +static herr_t aitercb( hid_t loc_id, const char *name, + const H5A_info_t *ainfo, void *op_data) { + PyObject *strname; + + strname = PyString_FromString(name); + /* Return the name of the attribute on op_data */ + PyList_Append(op_data, strname); + Py_DECREF(strname); + return(0); /* Loop until no more attrs remain in object */ +} + + +/**************************************************************** +** +** Aiterate(): Attribute set iteration routine. +** +****************************************************************/ +PyObject *Aiterate(hid_t loc_id) { + hsize_t i = 0; + PyObject *attrlist; /* List where the attrnames are put */ + + attrlist = PyList_New(0); + H5Aiterate(loc_id, H5_INDEX_CRT_ORDER, H5_ITER_NATIVE, &i, + (H5A_operator_t)aitercb, (void *)attrlist); + + return attrlist; +} + + +/**************************************************************** +** +** getHDF5ClassID(): Returns class ID for loc_id.name. -1 if error. +** +****************************************************************/ +H5T_class_t getHDF5ClassID(hid_t loc_id, + const char *name, + H5D_layout_t *layout, + hid_t *type_id, + hid_t *dataset_id) { + H5T_class_t class_id; + hid_t plist; + + /* Open the dataset. */ + if ( (*dataset_id = H5Dopen( loc_id, name, H5P_DEFAULT )) < 0 ) + return -1; + + /* Get an identifier for the datatype. */ + *type_id = H5Dget_type( *dataset_id ); + + /* Get the class. */ + class_id = H5Tget_class( *type_id ); + + /* Get the layout of the datatype */ + plist = H5Dget_create_plist(*dataset_id); + *layout = H5Pget_layout(plist); + H5Pclose(plist); + + return class_id; + +} + + +/* Helper routine that returns the rank, dims and byteorder for + UnImplemented objects. 2004 +*/ + +PyObject *H5UIget_info( hid_t loc_id, + const char *dset_name, + char *byteorder) +{ + hid_t dataset_id; + int rank; + hsize_t *dims; + hid_t space_id; + H5T_class_t class_id; + H5T_order_t order; + hid_t type_id; + PyObject *t; + int i; + + /* Open the dataset. */ + if ( (dataset_id = H5Dopen( loc_id, dset_name, H5P_DEFAULT )) < 0 ) { + Py_INCREF(Py_None); + return Py_None; /* Not chunked, so return None */ + } + + /* Get an identifier for the datatype. */ + type_id = H5Dget_type( dataset_id ); + + /* Get the class. 
*/ + class_id = H5Tget_class( type_id ); + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space( dataset_id )) < 0 ) + goto out; + + /* Get rank */ + if ( (rank = H5Sget_simple_extent_ndims( space_id )) < 0 ) + goto out; + + /* Book resources for dims */ + dims = (hsize_t *)malloc(rank * sizeof(hsize_t)); + + /* Get dimensions */ + if ( H5Sget_simple_extent_dims( space_id, dims, NULL) < 0 ) + goto out; + + /* Assign the dimensions to a tuple */ + t = PyTuple_New(rank); + for(i=0;i April 2004. + Adapted to support Tables by F. Alted September 2004. +*/ + +/* Test whether the datatype is of class complex + return 1 if it corresponds to our complex class, otherwise 0 */ +/* This may be ultimately confused with nested types with 2 components + called 'r' and 'i' and being floats, but in that case, the user + most probably wanted to keep a complex type, so getting a complex + instead of a nested type should not be a big issue (I hope!) :-/ + F. Alted 2005-05-23 */ +int is_complex(hid_t type_id) { + hid_t class_id, base_type_id; + hid_t class1, class2; + char *colname1, *colname2; + int result = 0; + hsize_t nfields; + + class_id = H5Tget_class(type_id); + if (class_id == H5T_COMPOUND) { + nfields = H5Tget_nmembers(type_id); + if (nfields == 2) { + colname1 = H5Tget_member_name(type_id, 0); + colname2 = H5Tget_member_name(type_id, 1); + if ((strcmp(colname1, "r") == 0) && (strcmp(colname2, "i") == 0)) { + class1 = H5Tget_member_class(type_id, 0); + class2 = H5Tget_member_class(type_id, 1); + if (class1 == H5T_FLOAT && class2 == H5T_FLOAT) + result = 1; + } + pt_H5free_memory(colname1); + pt_H5free_memory(colname2); + } + } + /* Is an Array of Complex? */ + else if (class_id == H5T_ARRAY) { + /* Get the array base component */ + base_type_id = H5Tget_super(type_id); + /* Call is_complex again */ + result = is_complex(base_type_id); + H5Tclose(base_type_id); + } + return result; +} + + +/* Return the byteorder of a complex datatype. + It is obtained from the real part, which is the first member. */ +static H5T_order_t get_complex_order(hid_t type_id) { + hid_t class_id, base_type_id; + hid_t real_type = 0; + H5T_order_t result = 0; + + class_id = H5Tget_class(type_id); + if (class_id == H5T_COMPOUND) { + real_type = H5Tget_member_type(type_id, 0); + } + else if (class_id == H5T_ARRAY) { + /* Get the array base component */ + base_type_id = H5Tget_super(type_id); + /* Get the type of real component. */ + real_type = H5Tget_member_type(base_type_id, 0); + H5Tclose(base_type_id); + } + if ((class_id == H5T_COMPOUND) || (class_id == H5T_ARRAY)) { + result = H5Tget_order(real_type); + H5Tclose(real_type); + } + return result; +} + + +/* Return the byteorder of a HDF5 data type */ +/* This is actually an extension of H5Tget_order to handle complex types */ +herr_t get_order(hid_t type_id, char *byteorder) { + H5T_order_t h5byteorder; + /* + hid_t class_id; + + class_id = H5Tget_class(type_id); + */ + + if (is_complex(type_id)) { + h5byteorder = get_complex_order(type_id); + } + else { + h5byteorder = H5Tget_order(type_id); + } + if (h5byteorder == H5T_ORDER_LE) { + strcpy(byteorder, "little"); + return h5byteorder; + } + else if (h5byteorder == H5T_ORDER_BE ) { + strcpy(byteorder, "big"); + return h5byteorder; + } + else if (h5byteorder == H5T_ORDER_NONE ) { + strcpy(byteorder, "irrelevant"); + return h5byteorder; + } + else { + /* This should never happen! 
*/ + fprintf(stderr, "Error: unsupported byteorder <%d>\n", h5byteorder); + strcpy(byteorder, "unsupported"); + return -1; + } +} + + +/* Set the byteorder of type_id. */ +/* This only works for datatypes that are not Complex. However, + these types should already been created with correct byteorder */ +herr_t set_order(hid_t type_id, const char *byteorder) { + herr_t status=0; + + if (! is_complex(type_id)) { + if (strcmp(byteorder, "little") == 0) + status = H5Tset_order(type_id, H5T_ORDER_LE); + else if (strcmp(byteorder, "big") == 0) + status = H5Tset_order(type_id, H5T_ORDER_BE); + else if (strcmp(byteorder, "irrelevant") == 0) { + /* Do nothing because 'irrelevant' doesn't require setting the + byteorder explicitely */ +/* status = H5Tset_order(type_id, H5T_ORDER_NONE ); */ + } + else { + fprintf(stderr, "Error: unsupported byteorder <%s>\n", byteorder); + status = -1; + } + } + return status; +} + + +/* Create a HDF5 atomic datatype that represents half precision floatting + point numbers defined by numpy as float16. */ +hid_t create_ieee_float16(const char *byteorder) { + hid_t float_id; + + if (byteorder == NULL) + float_id = H5Tcopy(H5T_NATIVE_FLOAT); + else if (strcmp(byteorder, "little") == 0) + float_id = H5Tcopy(H5T_IEEE_F32LE); + else + float_id = H5Tcopy(H5T_IEEE_F32BE); + + if (float_id < 0) + return float_id; + + if (H5Tset_fields(float_id, 15, 10, 5, 0, 10) < 0) + return -1; + + if (H5Tset_size(float_id, 2) < 0) + return -1; + + if (H5Tset_ebias(float_id, 15) < 0) + return -1; + + return float_id; +} + + +/* Create a HDF5 atomic datatype that represents quad precision floatting + point numbers. */ +hid_t create_ieee_quadprecision_float(const char *byteorder) { + hid_t float_id; + + if (byteorder == NULL) + float_id = H5Tcopy(H5T_NATIVE_DOUBLE); + else if (strcmp(byteorder, "little") == 0) + float_id = H5Tcopy(H5T_IEEE_F64LE); + else + float_id = H5Tcopy(H5T_IEEE_F64BE); + + if (float_id < 0) + return float_id; + + if (H5Tset_size(float_id, 16) < 0) + return -1; + + if ((H5Tset_precision(float_id, 128)) < 0) + return -1; + + if (H5Tset_fields(float_id , 127, 112, 15, 0, 112) < 0) + return -1; + + if (H5Tset_ebias(float_id, 16383) < 0) + return -1; + + return float_id; +} + + +/* Create a HDF5 compound datatype that represents complex numbers + defined by numpy as complex64. 
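The calls H5Tset_fields(float_id, 15, 10, 5, 0, 10) and H5Tset_ebias(float_id, 15) above encode the IEEE 754 half-precision layout: 1 sign bit (bit 15), 5 exponent bits (bits 10-14, bias 15) and 10 mantissa bits (bits 0-9). A quick NumPy check of that layout (illustrative only; subnormals, infinities and NaN are ignored)::

    import numpy as np

    bits = int(np.array(-1.5, dtype=np.float16).view(np.uint16))
    sign     = bits >> 15            # bit 15
    exponent = (bits >> 10) & 0x1f   # bits 10-14, biased by 15
    mantissa = bits & 0x3ff          # bits 0-9 (implicit leading 1)
    value = (-1.0) ** sign * (1 + mantissa / 1024.0) * 2.0 ** (exponent - 15)
    assert value == -1.5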
*/ +hid_t create_ieee_complex64(const char *byteorder) { + hid_t float_id, complex_id; + + complex_id = H5Tcreate(H5T_COMPOUND, sizeof(npy_complex64)); + if (byteorder == NULL) + float_id = H5Tcopy(H5T_NATIVE_FLOAT); + else if (strcmp(byteorder, "little") == 0) + float_id = H5Tcopy(H5T_IEEE_F32LE); + else + float_id = H5Tcopy(H5T_IEEE_F32BE); + + if (float_id < 0) + { + H5Tclose(complex_id); + return float_id; + } + + H5Tinsert(complex_id, "r", HOFFSET(npy_complex64, real), float_id); + H5Tinsert(complex_id, "i", HOFFSET(npy_complex64, imag), float_id); + H5Tclose(float_id); + return complex_id; +} + + +/* Counterpart for complex128 */ +hid_t create_ieee_complex128(const char *byteorder) { + hid_t float_id, complex_id; + + complex_id = H5Tcreate(H5T_COMPOUND, sizeof(npy_complex128)); + if (byteorder == NULL) + float_id = H5Tcopy(H5T_NATIVE_DOUBLE); + else if (strcmp(byteorder, "little") == 0) + float_id = H5Tcopy(H5T_IEEE_F64LE); + else + float_id = H5Tcopy(H5T_IEEE_F64BE); + + if (float_id < 0) + { + H5Tclose(complex_id); + return float_id; + } + + H5Tinsert(complex_id, "r", HOFFSET(npy_complex128, real), float_id); + H5Tinsert(complex_id, "i", HOFFSET(npy_complex128, imag), float_id); + H5Tclose(float_id); + return complex_id; +} + + +/* Counterpart for complex192 */ +hid_t create_ieee_complex192(const char *byteorder) { + herr_t err = 0; + hid_t float_id, complex_id; + H5T_order_t h5order = H5Tget_order(H5T_NATIVE_LDOUBLE); + + complex_id = H5Tcreate(H5T_COMPOUND, sizeof(npy_complex192)); + float_id = H5Tcopy(H5T_NATIVE_LDOUBLE); + if (float_id < 0) + { + H5Tclose(complex_id); + return float_id; + } + + if ((strcmp(byteorder, "little") == 0) && (h5order != H5T_ORDER_LE)) + err = H5Tset_order(float_id, H5T_ORDER_LE); + else if ((strcmp(byteorder, "big") == 0) && (h5order != H5T_ORDER_BE)) + err = H5Tset_order(float_id, H5T_ORDER_BE); + + if (err < 0) + { + H5Tclose(complex_id); + return err; + } + + H5Tinsert(complex_id, "r", HOFFSET(npy_complex192, real), float_id); + H5Tinsert(complex_id, "i", HOFFSET(npy_complex192, imag), float_id); + H5Tclose(float_id); + return complex_id; +} + + +/* Counterpart for complex256 */ +hid_t create_ieee_complex256(const char *byteorder) { + herr_t err = 0; + hid_t float_id, complex_id; + H5T_order_t h5order = H5Tget_order(H5T_NATIVE_LDOUBLE); + + complex_id = H5Tcreate(H5T_COMPOUND, sizeof(npy_complex256)); + float_id = H5Tcopy(H5T_NATIVE_LDOUBLE); + if (float_id < 0) + { + H5Tclose(complex_id); + return float_id; + } + + if ((strcmp(byteorder, "little") == 0) && (h5order != H5T_ORDER_LE)) + err = H5Tset_order(float_id, H5T_ORDER_LE); + else if ((strcmp(byteorder, "big") == 0) && (h5order != H5T_ORDER_BE)) + err = H5Tset_order(float_id, H5T_ORDER_BE); + + if (err < 0) + { + H5Tclose(complex_id); + return err; + } + + H5Tinsert(complex_id, "r", HOFFSET(npy_complex256, real), float_id); + H5Tinsert(complex_id, "i", HOFFSET(npy_complex256, imag), float_id); + H5Tclose(float_id); + return complex_id; +} + + +/* Return the number of significant bits in the real and imaginary parts */ +/* This is actually an extension of H5Tget_precision to handle complex types */ +size_t get_complex_precision(hid_t type_id) { + hid_t real_type; + size_t result; + real_type = H5Tget_member_type(type_id, 0); + result = H5Tget_precision(real_type); + H5Tclose(real_type); + return result; +} + +/* End of complex additions */ + + +/* The get_len_of_range has been taken from Python interpreter */ + +/* Return number of items in range/xrange (lo, hi, step). step > 0 + * required. 
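A pure-Python mirror of the counting logic (not part of the module, just for illustration): for a positive step, the number of items in the half-open range [lo, hi) is (hi - lo - 1)//step + 1 when lo < hi and 0 otherwise, which is exactly what Python's range() reports::

    def len_of_range(lo, hi, step):
        # mirrors get_len_of_range(); step must be > 0
        return (hi - lo - 1) // step + 1 if lo < hi else 0

    assert len_of_range(0, 10, 3) == len(range(0, 10, 3)) == 4
    assert len_of_range(5, 5, 1) == 0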
Return a value < 0 if & only if the true value is too + * large to fit in a signed long. + */ +hsize_t get_len_of_range(hsize_t lo, hsize_t hi, hsize_t step) +{ + /* ------------------------------------------------------------- + If lo >= hi, the range is empty. + Else if n values are in the range, the last one is + lo + (n-1)*step, which must be <= hi-1. Rearranging, + n <= (hi - lo - 1)/step + 1, so taking the floor of the RHS gives + the proper value. Since lo < hi in this case, hi-lo-1 >= 0, so + the RHS is non-negative and so truncation is the same as the + floor. Letting M be the largest positive long, the worst case + for the RHS numerator is hi=M, lo=-M-1, and then + hi-lo-1 = M-(-M-1)-1 = 2*M. Therefore unsigned long has enough + precision to compute the RHS exactly. + Note: We are using here 64 bit ints because PyTables can deal + with 64-bit addresses even on 32-bit platforms. + F. Alted 2006-09-25 + ---------------------------------------------------------------*/ + hsize_t n = 0; + if (lo < hi) { + hsize_t diff = hi - lo - 1; + n = (hsize_t)(diff / step + 1); + } + return n; +} + + +/* Truncate the dataset to at most size rows */ +herr_t truncate_dset( hid_t dataset_id, + const int maindim, + const hsize_t size) +{ + + hid_t space_id; + hsize_t *dims = NULL; + int rank; + + /* Get the dataspace handle */ + if ( (space_id = H5Dget_space(dataset_id)) < 0 ) + goto out; + + /* Get the rank */ + if ( (rank = H5Sget_simple_extent_ndims(space_id)) < 0 ) + goto out; + + if (rank) { /* multidimensional case */ + /* Book some memory for the selections */ + dims = (hsize_t *)malloc(rank*sizeof(hsize_t)); + + /* Get dataset dimensionality */ + if ( H5Sget_simple_extent_dims(space_id, dims, NULL) < 0 ) + goto out; + + /* Truncate the EArray */ + dims[maindim] = size; + if ( H5Dset_extent(dataset_id, dims) < 0 ) + goto out; + + /* Release resources */ + free(dims); + } + else { /* scalar case (should never enter here) */ + printf("A scalar Array cannot be truncated!.\n"); + goto out; + } + + /* Free resources */ + if ( H5Sclose(space_id) < 0 ) + return -1; + + return 0; + +out: + if (dims) free(dims); + return -1; +} + + +/* + * Helpers for management of HDF5 drivers + */ + +/* DIRECT driver */ +#ifndef H5_HAVE_DIRECT + +herr_t pt_H5Pset_fapl_direct(hid_t fapl_id, size_t alignment, + size_t block_size, size_t cbuf_size) +{ + return -1; +} + +#endif /* H5_HAVE_DIRECT */ + + +/* WINDOWS driver */ +#ifndef H5_HAVE_WINDOWS + +herr_t pt_H5Pset_fapl_windows(hid_t fapl_id) +{ + return -1; +} + +#endif /* H5_HAVE_WINDOWS */ + + +#if (H5_HAVE_IMAGE_FILE != 1) +/* HDF5 version < 1.8.9 */ + +herr_t pt_H5Pset_file_image(hid_t fapl_id, void *buf_ptr, size_t buf_len) { + return -1; +} + +ssize_t pt_H5Fget_file_image(hid_t file_id, void *buf_ptr, size_t buf_len) { + return -1; +} + +#endif /* (H5_HAVE_IMAGE_FILE != 1) */ + + +#if H5_VERSION_LE(1,8,12) + +herr_t pt_H5free_memory(void *buf) { + free(buf); + return 0; +} + +#endif diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 0000000..d263e06 --- /dev/null +++ b/src/utils.h @@ -0,0 +1,158 @@ +#include +#include "hdf5.h" + +/* Define this variable for error printings */ +/*#define DEBUG 1 */ +/* Define this variable for debugging printings */ +/*#define PRINT 1 */ +/* Define this for compile the main() function */ +/* #define MAIN 1 */ + +/* + * Status return values for the herr_t' type. + * Since some unix/c routines use 0 and -1 (or more precisely, non-negative + * vs. 
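The truncate_dset() helper above is what backs truncation of enlargeable datasets at the Python level. A hedged usage sketch (file and node names are arbitrary, and truncation to fewer rows requires a sufficiently recent HDF5)::

    import numpy as np
    import tables

    with tables.open_file("truncate_demo.h5", mode="w") as h5:
        ea = h5.create_earray("/", "e", atom=tables.Float64Atom(), shape=(0, 3))
        ea.append(np.zeros((10, 3)))
        ea.truncate(4)          # keep only the first 4 rows along the maindim
        assert ea.nrows == 4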
negative) as their return code, and some assumption had been made in + * the code about that, it is important to keep these constants the same + * values. When checking the success or failure of an integer-valued + * function, remember to compare against zero and not one of these two + * values. + */ +#define SUCCEED 0 +#define FAIL (-1) +#define UFAIL (unsigned)(-1) + +/* + * HDF Boolean type. + */ +#ifndef FALSE +# define FALSE 0 +#endif +#ifndef TRUE +# define TRUE (!FALSE) +#endif + +#ifdef H5_HAVE_WINDOWS +#define H5_HAVE_WINDOWS_DRIVER 1 +#else +#define H5_HAVE_WINDOWS_DRIVER 0 +#endif + +#ifdef H5_HAVE_DIRECT +#define H5_HAVE_DIRECT_DRIVER 1 +#else +#define H5_HAVE_DIRECT_DRIVER 0 +#endif + +#if (H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 8 && H5_VERS_RELEASE >= 9) || (H5_VERS_MAJOR == 1 && H5_VERS_MINOR > 8) +/* HDF5 version >= 1.8.9 */ +#define H5_HAVE_IMAGE_FILE 1 +#else +/* HDF5 version < 1.8.9 */ +#define H5_HAVE_IMAGE_FILE 0 +#endif + +/* COMAPTIBILITY: H5_VERSION_LE has been introduced in HDF5 1.8.7 */ +#ifndef H5_VERSION_LE +#define H5_VERSION_LE(Maj,Min,Rel) \ + (((H5_VERS_MAJOR==Maj) && (H5_VERS_MINOR==Min) && (H5_VERS_RELEASE<=Rel)) || \ + ((H5_VERS_MAJOR==Maj) && (H5_VERS_MINOR= 1.8.9 */ +herr_t pt_H5Pset_file_image(hid_t fapl_id, void *buf_ptr, size_t buf_len); +ssize_t pt_H5Fget_file_image(hid_t file_id, void *buf_ptr, size_t buf_len); +#else /* (H5_HAVE_IMAGE_FILE != 1) */ +/* HDF5 version < 1.8.9 */ +#define pt_H5Pset_file_image H5Pset_file_image +#define pt_H5Fget_file_image H5Fget_file_image +#endif /* (H5_HAVE_IMAGE_FILE != 1) */ + + +#if H5_VERSION_LE(1,8,12) +herr_t pt_H5free_memory(void *buf); +#else +#define pt_H5free_memory H5free_memory +#endif diff --git a/tables/__init__.py b/tables/__init__.py new file mode 100644 index 0000000..4a9b755 --- /dev/null +++ b/tables/__init__.py @@ -0,0 +1,149 @@ +"""PyTables, hierarchical datasets in Python. + +:URL: http://www.pytables.org/ + +PyTables is a package for managing hierarchical datasets and designed +to efficiently cope with extremely large amounts of data. + +""" + +# Necessary imports to get versions stored on the cython extension +from .utilsextension import get_hdf5_version as _get_hdf5_version + + +__version__ = "3.7.0" +"""The PyTables version number.""" + +hdf5_version = _get_hdf5_version() +"""The underlying HDF5 library version number. + +.. 
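These module-level version attributes are the recommended way to inspect the installed versions from user code; for example::

    import tables

    print(tables.__version__)    # PyTables release, e.g. '3.7.0'
    print(tables.hdf5_version)   # version of the underlying HDF5 library
    tables.print_versions()      # summary of PyTables, HDF5, NumPy and compressors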
versionadded:: 3.0 + +""" + +from .utilsextension import ( + blosc_compcode_to_compname_ as blosc_compcode_to_compname, + blosc_get_complib_info_ as blosc_get_complib_info, +) + +from .utilsextension import ( + blosc_compressor_list, is_hdf5_file, is_pytables_file, which_lib_version, + set_blosc_max_threads, silence_hdf5_messages, +) + +from .misc.enum import Enum +from .atom import * +from .flavor import restrict_flavors +from .description import * +from .filters import Filters + +# Import the user classes from the proper modules +from .exceptions import * +from .file import File, open_file, copy_file +from .node import Node +from .group import Group +from .leaf import Leaf +from .table import Table, Cols, Column +from .array import Array +from .carray import CArray +from .earray import EArray +from .vlarray import VLArray +from .unimplemented import UnImplemented, Unknown +from .expression import Expr +from .tests import print_versions, test + + +# List here only the objects we want to be publicly available +__all__ = [ + # Exceptions and warnings: + 'HDF5ExtError', + 'ClosedNodeError', 'ClosedFileError', 'FileModeError', + 'NaturalNameWarning', 'NodeError', 'NoSuchNodeError', + 'UndoRedoError', 'UndoRedoWarning', + 'PerformanceWarning', + 'FlavorError', 'FlavorWarning', + 'FiltersWarning', 'DataTypeWarning', + # Functions: + 'is_hdf5_file', 'is_pytables_file', 'which_lib_version', + 'copy_file', 'open_file', 'print_versions', 'test', + 'split_type', 'restrict_flavors', 'set_blosc_max_threads', + 'silence_hdf5_messages', + # Helper classes: + 'IsDescription', 'Description', 'Filters', 'Cols', 'Column', + # Types: + 'Enum', + # Atom types: + 'Atom', 'StringAtom', 'BoolAtom', + 'IntAtom', 'UIntAtom', 'Int8Atom', 'UInt8Atom', 'Int16Atom', 'UInt16Atom', + 'Int32Atom', 'UInt32Atom', 'Int64Atom', 'UInt64Atom', + 'FloatAtom', 'Float32Atom', 'Float64Atom', + 'ComplexAtom', 'Complex32Atom', 'Complex64Atom', 'Complex128Atom', + 'TimeAtom', 'Time32Atom', 'Time64Atom', + 'EnumAtom', + 'PseudoAtom', 'ObjectAtom', 'VLStringAtom', 'VLUnicodeAtom', + # Column types: + 'Col', 'StringCol', 'BoolCol', + 'IntCol', 'UIntCol', 'Int8Col', 'UInt8Col', 'Int16Col', 'UInt16Col', + 'Int32Col', 'UInt32Col', 'Int64Col', 'UInt64Col', + 'FloatCol', 'Float32Col', 'Float64Col', + 'ComplexCol', 'Complex32Col', 'Complex64Col', 'Complex128Col', + 'TimeCol', 'Time32Col', 'Time64Col', + 'EnumCol', + # Node classes: + 'Node', 'Group', 'Leaf', 'Table', 'Array', 'CArray', 'EArray', 'VLArray', + 'UnImplemented', 'Unknown', + # The File class: + 'File', + # Expr class + 'Expr', +] + +if 'Float16Atom' in locals(): + # float16 is new in numpy 1.6.0 + __all__.extend(('Float16Atom', 'Float16Col')) + + +from .utilsextension import _broken_hdf5_long_double +if not _broken_hdf5_long_double(): + if 'Float96Atom' in locals(): + __all__.extend(('Float96Atom', 'Float96Col')) + __all__.extend(('Complex192Atom', 'Complex192Col')) # XXX check + + if 'Float128Atom' in locals(): + __all__.extend(('Float128Atom', 'Float128Col')) + __all__.extend(('Complex256Atom', 'Complex256Col')) # XXX check + +else: + + from . import atom as _atom + from . 
import description as _description + try: + del _atom.Float96Atom, _atom.Complex192Col + del _description.Float96Col, _description.Complex192Col + _atom.all_types.discard('complex192') + _atom.ComplexAtom._isizes.remove(24) + except AttributeError: + try: + del _atom.Float128Atom, _atom.Complex256Atom + del _description.Float128Col, _description.Complex256Col + _atom.all_types.discard('complex256') + _atom.ComplexAtom._isizes.remove(32) + except AttributeError: + pass + del _atom, _description +del _broken_hdf5_long_double + + +def get_pytables_version(): + warnings.warn( + "the 'get_pytables_version()' function is deprecated and could be " + "removed in future versions. Please use 'tables.__version__'", + DeprecationWarning) + return __version__ + +def get_hdf5_version(): + warnings.warn( + "the 'get_hdf5_version()' function is deprecated and could be " + "removed in future versions. Please use 'tables.hdf5_version'", + DeprecationWarning) + return hdf5_version \ No newline at end of file diff --git a/tables/_comp_bzip2.pyx b/tables/_comp_bzip2.pyx new file mode 100644 index 0000000..4a3399b --- /dev/null +++ b/tables/_comp_bzip2.pyx @@ -0,0 +1,19 @@ +import sys +from libc.stdlib cimport free + + +cdef extern from "H5Zbzip2.h": + int register_bzip2(char **, char **) + + +def register_(): + cdef char *version + cdef char *date + + if not register_bzip2(&version, &date): + return None + + compinfo = (version, date) + free(version) + free(date) + return compinfo[0].decode('ascii'), compinfo[1].decode('ascii') diff --git a/tables/_comp_lzo.pyx b/tables/_comp_lzo.pyx new file mode 100644 index 0000000..c6af453 --- /dev/null +++ b/tables/_comp_lzo.pyx @@ -0,0 +1,19 @@ +import sys +from libc.stdlib cimport free + + +cdef extern from "H5Zlzo.h": + int register_lzo(char **, char **) + + +def register_(): + cdef char *version + cdef char *date + + if not register_lzo(&version, &date): + return None + + compinfo = (version, date) + free(version) + free(date) + return compinfo[0].decode('ascii'), compinfo[1].decode('ascii') diff --git a/tables/array.py b/tables/array.py new file mode 100644 index 0000000..2b3e651 --- /dev/null +++ b/tables/array.py @@ -0,0 +1,912 @@ +"""Here is defined the Array class.""" + +import operator +import sys +import numpy as np + +from . import hdf5extension +from .filters import Filters +from .flavor import flavor_of, array_as_internal, internal_to_flavor +from .leaf import Leaf +from .utils import (is_idx, convert_to_np_atom2, SizeType, lazyattr, + byteorders, quantize) + + +# default version for ARRAY objects +# obversion = "1.0" # initial version +# obversion = "2.0" # Added an optional EXTDIM attribute +# obversion = "2.1" # Added support for complex datatypes +# obversion = "2.2" # This adds support for time datatypes. +# obversion = "2.3" # This adds support for enumerated datatypes. +obversion = "2.4" # Numeric and numarray flavors are gone. + + +class Array(hdf5extension.Array, Leaf): + """This class represents homogeneous datasets in an HDF5 file. + + This class provides methods to write or read data to or from array objects + in the file. This class does not allow you neither to enlarge nor compress + the datasets on disk; use the EArray class (see :ref:`EArrayClassDescr`) if + you want enlargeable dataset support or compression features, or CArray + (see :ref:`CArrayClassDescr`) if you just want compression. 
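To make the distinction concrete, here is a hedged sketch contrasting the three kinds of homogeneous datasets (file name and node names are illustrative)::

    import tables

    with tables.open_file("kinds_demo.h5", mode="w") as h5:
        h5.create_array("/", "a", [1, 2, 3])               # fixed, uncompressed
        h5.create_carray("/", "c", atom=tables.Int32Atom(), shape=(100,),
                         filters=tables.Filters(complevel=5, complib="zlib"))
        h5.create_earray("/", "e", atom=tables.Int32Atom(), shape=(0,))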
+ + An interesting property of the Array class is that it remembers the + *flavor* of the object that has been saved so that if you saved, for + example, a list, you will get a list during readings afterwards; if you + saved a NumPy array, you will get a NumPy object, and so forth. + + Note that this class inherits all the public attributes and methods that + Leaf (see :ref:`LeafClassDescr`) already provides. However, as Array + instances have no internal I/O buffers, it is not necessary to use the + flush() method they inherit from Leaf in order to save their internal state + to disk. When a writing method call returns, all the data is already on + disk. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode* + + name : str + The name of this node in its parent group. + obj + The array or scalar to be saved. Accepted types are NumPy + arrays and scalars as well as native Python sequences and + scalars, provided that values are regular (i.e. they are not + like ``[[1,2],2]``) and homogeneous (i.e. all the elements are + of the same type). + + .. versionchanged:: 3.0 + Renamed form *object* into *obj*. + title + A description for this node (it sets the ``TITLE`` HDF5 attribute on + disk). + byteorder + The byteorder of the data *on disk*, specified as 'little' or 'big'. + If this is not specified, the byteorder is that of the given `object`. + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, object + birth time); default True. Semantics of these times depend on their + implementation in the HDF5 library: refer to documentation of the + H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata + change time) is implemented. + + .. versionadded:: 3.4.3 + + """ + + # Class identifier. + _c_classid = 'ARRAY' + + @lazyattr + def dtype(self): + """The NumPy ``dtype`` that most closely matches this array.""" + return self.atom.dtype + + @property + def nrows(self): + """The number of rows in the array.""" + if self.shape == (): + return SizeType(1) # scalar case + else: + return self.shape[self.maindim] + + @property + def rowsize(self): + """The size of the rows in bytes in dimensions orthogonal to + *maindim*.""" + maindim = self.maindim + rowsize = self.atom.size + for i, dim in enumerate(self.shape): + if i != maindim: + rowsize *= dim + return rowsize + + @property + def size_in_memory(self): + """The size of this array's data in bytes when it is fully loaded into + memory.""" + return self.nrows * self.rowsize + + def __init__(self, parentnode, name, + obj=None, title="", + byteorder=None, _log=True, _atom=None, + track_times=True): + + self._v_version = None + """The object version of this array.""" + self._v_new = new = obj is not None + """Is this the first time the node has been created?""" + self._v_new_title = title + """New title for this node.""" + self._obj = obj + """The object to be stored in the array. It can be any of numpy, + list, tuple, string, integer of floating point types, provided + that they are regular (i.e. they are not like ``[[1, 2], 2]``). + + .. versionchanged:: 3.0 + Renamed form *_object* into *_obj*. + + """ + + self._v_convert = True + """Whether the ``Array`` object must be converted or not.""" + + # Miscellaneous iteration rubbish. 
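The flavor bookkeeping described above means that what you store is what you get back. An illustrative round trip (the file name is arbitrary)::

    import numpy as np
    import tables

    with tables.open_file("flavor_demo.h5", mode="w") as h5:
        h5.create_array("/", "aslist", [1, 2, 3])       # flavor 'python'
        h5.create_array("/", "asarray", np.arange(3))   # flavor 'numpy'
        assert h5.root.aslist.read() == [1, 2, 3]        # a plain list comes back
        assert isinstance(h5.root.asarray.read(), np.ndarray)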
+ self._start = None + """Starting row for the current iteration.""" + self._stop = None + """Stopping row for the current iteration.""" + self._step = None + """Step size for the current iteration.""" + self._nrowsread = None + """Number of rows read up to the current state of iteration.""" + self._startb = None + """Starting row for current buffer.""" + self._stopb = None + """Stopping row for current buffer. """ + self._row = None + """Current row in iterators (sentinel).""" + self._init = False + """Whether we are in the middle of an iteration or not (sentinel).""" + self.listarr = None + """Current buffer in iterators.""" + + # Documented (*public*) attributes. + self.atom = _atom + """An Atom (see :ref:`AtomClassDescr`) instance representing the *type* + and *shape* of the atomic objects to be saved. + """ + self.shape = None + """The shape of the stored array.""" + self.nrow = None + """On iterators, this is the index of the current row.""" + self.extdim = -1 # ordinary arrays are not enlargeable + """The index of the enlargeable dimension.""" + + # Ordinary arrays have no filters: leaf is created with default ones. + super().__init__(parentnode, name, new, Filters(), byteorder, _log, + track_times) + + def _g_create(self): + """Save a new array in file.""" + + self._v_version = obversion + try: + # `Leaf._g_post_init_hook()` should be setting the flavor on disk. + self._flavor = flavor = flavor_of(self._obj) + nparr = array_as_internal(self._obj, flavor) + except Exception: # XXX + # Problems converting data. Close the node and re-raise exception. + self.close(flush=0) + raise + + # Raise an error in case of unsupported object + if nparr.dtype.kind in ['V', 'U', 'O']: # in void, unicode, object + raise TypeError("Array objects cannot currently deal with void, " + "unicode or object arrays") + + # Decrease the number of references to the object + self._obj = None + + # Fix the byteorder of data + nparr = self._g_fix_byteorder_data(nparr, nparr.dtype.byteorder) + + # Create the array on-disk + try: + # ``self._v_objectid`` needs to be set because would be + # needed for setting attributes in some descendants later + # on + (self._v_objectid, self.shape, self.atom) = self._create_array( + nparr, self._v_new_title, self.atom) + except Exception: # XXX + # Problems creating the Array on disk. Close node and re-raise. + self.close(flush=0) + raise + + # Compute the optimal buffer size + self.nrowsinbuf = self._calc_nrowsinbuf() + # Arrays don't have chunkshapes (so, set it to None) + self._v_chunkshape = None + + return self._v_objectid + + def _g_open(self): + """Get the metadata info for an array in file.""" + + (oid, self.atom, self.shape, self._v_chunkshape) = self._open_array() + + self.nrowsinbuf = self._calc_nrowsinbuf() + + return oid + + def get_enum(self): + """Get the enumerated type associated with this array. + + If this array is of an enumerated type, the corresponding Enum instance + (see :ref:`EnumClassDescr`) is returned. If it is not of an enumerated + type, a TypeError is raised. + + """ + + if self.atom.kind != 'enum': + raise TypeError("array ``%s`` is not of an enumerated type" + % self._v_pathname) + + return self.atom.enum + + def iterrows(self, start=None, stop=None, step=None): + """Iterate over the rows of the array. + + This method returns an iterator yielding an object of the current + flavor for each selected row in the array. The returned rows are taken + from the *main dimension*. 
+ + If a range is not supplied, *all the rows* in the array are iterated + upon - you can also use the :meth:`Array.__iter__` special method for + that purpose. If you only want to iterate over a given *range of rows* + in the array, you may use the start, stop and step parameters. + + Examples + -------- + + :: + + result = [row for row in arrayInstance.iterrows(step=4)] + + .. versionchanged:: 3.0 + If the *start* parameter is provided and *stop* is None then the + array is iterated from *start* to the last line. + In PyTables < 3.0 only one element was returned. + + """ + + try: + (self._start, self._stop, self._step) = self._process_range( + start, stop, step) + except IndexError: + # If problems with indexes, silently return the null tuple + return () + self._init_loop() + return self + + def __iter__(self): + """Iterate over the rows of the array. + + This is equivalent to calling :meth:`Array.iterrows` with default + arguments, i.e. it iterates over *all the rows* in the array. + + Examples + -------- + + :: + + result = [row[2] for row in array] + + Which is equivalent to:: + + result = [row[2] for row in array.iterrows()] + + """ + + if not self._init: + # If the iterator is called directly, assign default variables + self._start = 0 + self._stop = self.nrows + self._step = 1 + # and initialize the loop + self._init_loop() + return self + + def _init_loop(self): + """Initialization for the __iter__ iterator.""" + + self._nrowsread = self._start + self._startb = self._start + self._row = -1 # Sentinel + self._init = True # Sentinel + self.nrow = SizeType(self._start - self._step) # row number + + def __next__(self): + """Get the next element of the array during an iteration. + + The element is returned as an object of the current flavor. + + """ + + # this could probably be sped up for long iterations by reusing the + # listarr buffer + if self._nrowsread >= self._stop: + self._init = False + self.listarr = None # fixes issue #308 + raise StopIteration # end of iteration + else: + # Read a chunk of rows + if self._row + 1 >= self.nrowsinbuf or self._row < 0: + self._stopb = self._startb + self._step * self.nrowsinbuf + # Protection for reading more elements than needed + if self._stopb > self._stop: + self._stopb = self._stop + listarr = self._read(self._startb, self._stopb, self._step) + # Swap the axes to easy the return of elements + if self.extdim > 0: + listarr = listarr.swapaxes(self.extdim, 0) + self.listarr = internal_to_flavor(listarr, self.flavor) + self._row = -1 + self._startb = self._stopb + self._row += 1 + self.nrow += self._step + self._nrowsread += self._step + # Fixes bug #968132 + # if self.listarr.shape: + if self.shape: + return self.listarr[self._row] + else: + return self.listarr # Scalar case + + def _interpret_indexing(self, keys): + """Internal routine used by __getitem__ and __setitem__""" + + maxlen = len(self.shape) + shape = (maxlen,) + startl = np.empty(shape=shape, dtype=SizeType) + stopl = np.empty(shape=shape, dtype=SizeType) + stepl = np.empty(shape=shape, dtype=SizeType) + stop_None = np.zeros(shape=shape, dtype=SizeType) + if not isinstance(keys, tuple): + keys = (keys,) + nkeys = len(keys) + dim = 0 + # Here is some problem when dealing with [...,...] 
params + # but this is a bit weird way to pass parameters anyway + for key in keys: + ellipsis = 0 # Sentinel + if isinstance(key, type(Ellipsis)): + ellipsis = 1 + for diml in range(dim, len(self.shape) - (nkeys - dim) + 1): + startl[dim] = 0 + stopl[dim] = self.shape[diml] + stepl[dim] = 1 + dim += 1 + elif dim >= maxlen: + raise IndexError("Too many indices for object '%s'" % + self._v_pathname) + elif is_idx(key): + key = operator.index(key) + + # Protection for index out of range + if key >= self.shape[dim]: + raise IndexError("Index out of range") + if key < 0: + # To support negative values (Fixes bug #968149) + key += self.shape[dim] + start, stop, step = self._process_range( + key, key + 1, 1, dim=dim) + stop_None[dim] = 1 + elif isinstance(key, slice): + start, stop, step = self._process_range( + key.start, key.stop, key.step, dim=dim) + else: + raise TypeError("Non-valid index or slice: %s" % key) + if not ellipsis: + startl[dim] = start + stopl[dim] = stop + stepl[dim] = step + dim += 1 + + # Complete the other dimensions, if needed + if dim < len(self.shape): + for diml in range(dim, len(self.shape)): + startl[dim] = 0 + stopl[dim] = self.shape[diml] + stepl[dim] = 1 + dim += 1 + + # Compute the shape for the container properly. Fixes #1288792 + shape = [] + for dim in range(len(self.shape)): + new_dim = len(range(startl[dim], stopl[dim], stepl[dim])) + if not (new_dim == 1 and stop_None[dim]): + shape.append(new_dim) + + return startl, stopl, stepl, shape + + def _fancy_selection(self, args): + """Performs a NumPy-style fancy selection in `self`. + + Implements advanced NumPy-style selection operations in + addition to the standard slice-and-int behavior. + + Indexing arguments may be ints, slices or lists of indices. + + Note: This is a backport from the h5py project. + + """ + + # Internal functions + + def validate_number(num, length): + """Validate a list member for the given axis length.""" + + try: + num = int(num) + except TypeError: + raise TypeError("Illegal index: %r" % num) + if num > length - 1: + raise IndexError("Index out of bounds: %d" % num) + + def expand_ellipsis(args, rank): + """Expand ellipsis objects and fill in missing axes.""" + + n_el = sum(1 for arg in args if arg is Ellipsis) + if n_el > 1: + raise IndexError("Only one ellipsis may be used.") + elif n_el == 0 and len(args) != rank: + args = args + (Ellipsis,) + + final_args = [] + n_args = len(args) + for idx, arg in enumerate(args): + if arg is Ellipsis: + final_args.extend((slice(None),) * (rank - n_args + 1)) + else: + final_args.append(arg) + + if len(final_args) > rank: + raise IndexError("Too many indices.") + + return final_args + + def translate_slice(exp, length): + """Given a slice object, return a 3-tuple (start, count, step) + + This is for for use with the hyperslab selection routines. 
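A simplified mirror of that translation (positive step only, no bounds checking) shows how a slice becomes a (start, count, step) hyperslab::

    def hyperslab(start, stop, step, length):
        start = 0 if start is None else start
        stop = length if stop is None else stop
        step = 1 if step is None else step
        count = (stop - start) // step + (1 if (stop - start) % step else 0)
        return start, count, step

    assert hyperslab(2, 10, 3, 12) == (2, 3, 3)        # rows 2, 5 and 8
    assert hyperslab(None, None, None, 12) == (0, 12, 1)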
+ + """ + + start, stop, step = exp.start, exp.stop, exp.step + if start is None: + start = 0 + else: + start = int(start) + if stop is None: + stop = length + else: + stop = int(stop) + if step is None: + step = 1 + else: + step = int(step) + + if step < 1: + raise IndexError("Step must be >= 1 (got %d)" % step) + if stop == start: + raise IndexError("Zero-length selections are not allowed") + if stop < start: + raise IndexError("Reverse-order selections are not allowed") + if start < 0: + start = length + start + if stop < 0: + stop = length + stop + + if not 0 <= start <= (length - 1): + raise IndexError( + "Start index %s out of range (0-%d)" % (start, length - 1)) + if not 1 <= stop <= length: + raise IndexError( + "Stop index %s out of range (1-%d)" % (stop, length)) + + count = (stop - start) // step + if (stop - start) % step != 0: + count += 1 + + if start + count > length: + raise IndexError( + "Selection out of bounds (%d; axis has %d)" % + (start + count, length)) + + return start, count, step + + # Main code for _fancy_selection + mshape = [] + selection = [] + + if not isinstance(args, tuple): + args = (args,) + + args = expand_ellipsis(args, len(self.shape)) + + list_seen = False + reorder = None + for idx, (exp, length) in enumerate(zip(args, self.shape)): + if isinstance(exp, slice): + start, count, step = translate_slice(exp, length) + selection.append((start, count, step, idx, "AND")) + mshape.append(count) + else: + try: + exp = list(exp) + except TypeError: + exp = [exp] # Handle scalar index as a list of length 1 + mshape.append(0) # Keep track of scalar index for NumPy + else: + mshape.append(len(exp)) + if len(exp) == 0: + raise IndexError( + "Empty selections are not allowed (axis %d)" % idx) + elif len(exp) > 1: + if list_seen: + raise IndexError("Only one selection list is allowed") + else: + list_seen = True + else: + if (not isinstance(exp[0], (int, np.integer)) or + (isinstance(exp[0], np.ndarray) and not + np.issubdtype(exp[0].dtype, np.integer))): + raise TypeError("Only integer coordinates allowed.") + + nexp = np.asarray(exp, dtype="i8") + # Convert negative values + nexp = np.where(nexp < 0, length + nexp, nexp) + # Check whether the list is ordered or not + # (only one unordered list is allowed) + if len(nexp) != len(np.unique(nexp)): + raise IndexError( + "Selection lists cannot have repeated values") + neworder = nexp.argsort() + if (neworder.shape != (len(exp),) or + np.sum(np.abs(neworder - np.arange(len(exp)))) != 0): + if reorder is not None: + raise IndexError( + "Only one selection list can be unordered") + corrected_idx = sum(1 for x in mshape if x != 0) - 1 + reorder = (corrected_idx, neworder) + nexp = nexp[neworder] + for select_idx in range(len(nexp) + 1): + # This crazy piece of code performs a list selection + # using HDF5 hyperslabs. + # For each index, perform a "NOTB" selection on every + # portion of *this axis* which falls *outside* the list + # selection. For this to work, the input array MUST be + # monotonically increasing. 
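Worked example of the "NOTB" trick just described, for an ordered list selection [1, 5, 10] on an axis of length 12: every run of the axis that falls *between* the selected indices is excluded with one hyperslab per gap::

    indices, length = [1, 5, 10], 12
    notb, prev = [], 0
    for i in indices + [length]:          # the sentinel closes the final gap
        if i - prev > 0:
            notb.append((prev, i - prev))  # (start, count) hyperslab to exclude
        prev = i + 1
    assert notb == [(0, 1), (2, 3), (6, 4), (11, 1)]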
+ if select_idx < len(nexp): + validate_number(nexp[select_idx], length) + if select_idx == 0: + start = 0 + count = nexp[0] + elif select_idx == len(nexp): + start = nexp[-1] + 1 + count = length - start + else: + start = nexp[select_idx - 1] + 1 + count = nexp[select_idx] - start + if count > 0: + selection.append((start, count, 1, idx, "NOTB")) + + mshape = tuple(x for x in mshape if x != 0) + return selection, reorder, mshape + + def __getitem__(self, key): + """Get a row, a range of rows or a slice from the array. + + The set of tokens allowed for the key is the same as that for extended + slicing in Python (including the Ellipsis or ... token). The result is + an object of the current flavor; its shape depends on the kind of slice + used as key and the shape of the array itself. + + Furthermore, NumPy-style fancy indexing, where a list of indices in a + certain axis is specified, is also supported. Note that only one list + per selection is supported right now. Finally, NumPy-style point and + boolean selections are supported as well. + + Examples + -------- + + :: + + array1 = array[4] # simple selection + array2 = array[4:1000:2] # slice selection + array3 = array[1, ..., ::2, 1:4, 4:] # general slice selection + array4 = array[1, [1,5,10], ..., -1] # fancy selection + array5 = array[np.where(array[:] > 4)] # point selection + array6 = array[array[:] > 4] # boolean selection + + """ + + self._g_check_open() + + try: + # First, try with a regular selection + startl, stopl, stepl, shape = self._interpret_indexing(key) + arr = self._read_slice(startl, stopl, stepl, shape) + except TypeError: + # Then, try with a point-wise selection + try: + coords = self._point_selection(key) + arr = self._read_coords(coords) + except TypeError: + # Finally, try with a fancy selection + selection, reorder, shape = self._fancy_selection(key) + arr = self._read_selection(selection, reorder, shape) + + if self.flavor == "numpy" or not self._v_convert: + return arr + + return internal_to_flavor(arr, self.flavor) + + def __setitem__(self, key, value): + """Set a row, a range of rows or a slice in the array. + + It takes different actions depending on the type of the key parameter: + if it is an integer, the corresponding array row is set to value (the + value is broadcast when needed). If key is a slice, the row slice + determined by it is set to value (as usual, if the slice to be updated + exceeds the actual shape of the array, only the values in the existing + range are updated). + + If value is a multidimensional object, then its shape must be + compatible with the shape determined by key, otherwise, a ValueError + will be raised. + + Furthermore, NumPy-style fancy indexing, where a list of indices in a + certain axis is specified, is also supported. Note that only one list + per selection is supported right now. Finally, NumPy-style point and + boolean selections are supported as well. + + Examples + -------- + + :: + + a1[0] = 333 # assign an integer to a Integer Array row + a2[0] = 'b' # assign a string to a string Array row + a3[1:4] = 5 # broadcast 5 to slice 1:4 + a4[1:4:2] = 'xXx' # broadcast 'xXx' to slice 1:4:2 + + # General slice update (a5.shape = (4,3,2,8,5,10). 
+ a5[1, ..., ::2, 1:4, 4:] = numpy.arange(1728, shape=(4,3,2,4,3,6)) + a6[1, [1,5,10], ..., -1] = arr # fancy selection + a7[np.where(a6[:] > 4)] = 4 # point selection + broadcast + a8[arr > 4] = arr2 # boolean selection + + """ + + self._g_check_open() + + # Create an array compliant with the specified slice + nparr = convert_to_np_atom2(value, self.atom) + if nparr.size == 0: + return + + # truncate data if least_significant_digit filter is set + # TODO: add the least_significant_digit attribute to the array on disk + if (self.filters.least_significant_digit is not None and + not np.issubdtype(nparr.dtype, np.signedinteger)): + nparr = quantize(nparr, self.filters.least_significant_digit) + + try: + startl, stopl, stepl, shape = self._interpret_indexing(key) + self._write_slice(startl, stopl, stepl, shape, nparr) + except TypeError: + # Then, try with a point-wise selection + try: + coords = self._point_selection(key) + self._write_coords(coords, nparr) + except TypeError: + selection, reorder, shape = self._fancy_selection(key) + self._write_selection(selection, reorder, shape, nparr) + + def _check_shape(self, nparr, slice_shape): + """Test that nparr shape is consistent with underlying object. + + If not, try creating a new nparr object, using broadcasting if + necessary. + + """ + + if nparr.shape != (slice_shape + self.atom.dtype.shape): + # Create an array compliant with the specified shape + narr = np.empty(shape=slice_shape, dtype=self.atom.dtype) + + # Assign the value to it. It will raise a ValueError exception + # if the objects cannot be broadcast to a single shape. + narr[...] = nparr + return narr + else: + return nparr + + def _read_slice(self, startl, stopl, stepl, shape): + """Read a slice based on `startl`, `stopl` and `stepl`.""" + + nparr = np.empty(dtype=self.atom.dtype, shape=shape) + # Protection against reading empty arrays + if 0 not in shape: + # Arrays that have non-zero dimensionality + self._g_read_slice(startl, stopl, stepl, nparr) + # For zero-shaped arrays, return the scalar + if nparr.shape == (): + nparr = nparr[()] + return nparr + + def _read_coords(self, coords): + """Read a set of points defined by `coords`.""" + + nparr = np.empty(dtype=self.atom.dtype, shape=len(coords)) + if len(coords) > 0: + self._g_read_coords(coords, nparr) + # For zero-shaped arrays, return the scalar + if nparr.shape == (): + nparr = nparr[()] + return nparr + + def _read_selection(self, selection, reorder, shape): + """Read a `selection`. + + Reorder if necessary. + + """ + + # Create the container for the slice + nparr = np.empty(dtype=self.atom.dtype, shape=shape) + # Arrays that have non-zero dimensionality + self._g_read_selection(selection, nparr) + # For zero-shaped arrays, return the scalar + if nparr.shape == (): + nparr = nparr[()] + elif reorder is not None: + # We need to reorder the array + idx, neworder = reorder + k = [slice(None)] * len(shape) + k[idx] = neworder.argsort() + # Apparently, a copy is not needed here, but doing it + # for symmetry with the `_write_selection()` method. 
+ nparr = nparr[tuple(k)].copy() + return nparr + + def _write_slice(self, startl, stopl, stepl, shape, nparr): + """Write `nparr` in a slice based on `startl`, `stopl` and `stepl`.""" + + nparr = self._check_shape(nparr, tuple(shape)) + countl = ((stopl - startl - 1) // stepl) + 1 + self._g_write_slice(startl, stepl, countl, nparr) + + def _write_coords(self, coords, nparr): + """Write `nparr` values in points defined by `coords` coordinates.""" + + if len(coords) > 0: + nparr = self._check_shape(nparr, (len(coords),)) + self._g_write_coords(coords, nparr) + + def _write_selection(self, selection, reorder, shape, nparr): + """Write `nparr` in `selection`. + + Reorder if necessary. + + """ + + nparr = self._check_shape(nparr, tuple(shape)) + # Check whether we should reorder the array + if reorder is not None: + idx, neworder = reorder + k = [slice(None)] * len(shape) + k[idx] = neworder + # For a reason a don't understand well, we need a copy of + # the reordered array + nparr = nparr[tuple(k)].copy() + self._g_write_selection(selection, nparr) + + def _read(self, start, stop, step, out=None): + """Read the array from disk without slice or flavor processing.""" + + nrowstoread = len(range(start, stop, step)) + shape = list(self.shape) + if shape: + shape[self.maindim] = nrowstoread + if out is None: + arr = np.empty(dtype=self.atom.dtype, shape=shape) + else: + bytes_required = self.rowsize * nrowstoread + # if buffer is too small, it will segfault + if bytes_required != out.nbytes: + raise ValueError(f'output array size invalid, got {out.nbytes}' + f' bytes, need {bytes_required} bytes') + if not out.flags['C_CONTIGUOUS']: + raise ValueError('output array not C contiguous') + arr = out + # Protection against reading empty arrays + if 0 not in shape: + # Arrays that have non-zero dimensionality + self._read_array(start, stop, step, arr) + # data is always read in the system byteorder + # if the out array's byteorder is different, do a byteswap + if (out is not None and + byteorders[arr.dtype.byteorder] != sys.byteorder): + arr.byteswap(True) + return arr + + def read(self, start=None, stop=None, step=None, out=None): + """Get data in the array as an object of the current flavor. + + The start, stop and step parameters can be used to select only a + *range of rows* in the array. Their meanings are the same as in + the built-in range() Python function, except that negative values + of step are not allowed yet. Moreover, if only start is specified, + then stop will be set to start + 1. If you do not specify neither + start nor stop, then *all the rows* in the array are selected. + + The out parameter may be used to specify a NumPy array to receive + the output data. Note that the array must have the same size as + the data selected with the other parameters. Note that the array's + datatype is not checked and no type casting is performed, so if it + does not match the datatype on disk, the output will not be correct. + Also, this parameter is only valid when the array's flavor is set + to 'numpy'. Otherwise, a TypeError will be raised. + + When data is read from disk in NumPy format, the output will be + in the current system's byteorder, regardless of how it is stored + on disk. + The exception is when an output buffer is supplied, in which case + the output will be in the byteorder of that output buffer. + + .. versionchanged:: 3.0 + Added the *out* parameter. 
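A hedged usage sketch of the ``out`` parameter (``some_array`` is a hypothetical 1-D float64 Array node with flavor 'numpy'; the buffer must be C-contiguous and exactly the size of the selected rows)::

    import numpy as np

    out = np.empty((100,), dtype=np.float64)
    data = some_array.read(start=0, stop=100, out=out)
    assert np.shares_memory(data, out)   # the data landed in the supplied buffer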
+ + """ + + self._g_check_open() + if out is not None and self.flavor != 'numpy': + msg = ("Optional 'out' argument may only be supplied if array " + "flavor is 'numpy', currently is {}").format(self.flavor) + raise TypeError(msg) + (start, stop, step) = self._process_range_read(start, stop, step) + arr = self._read(start, stop, step, out) + return internal_to_flavor(arr, self.flavor) + + def _g_copy_with_stats(self, group, name, start, stop, step, + title, filters, chunkshape, _log, **kwargs): + """Private part of Leaf.copy() for each kind of leaf.""" + + # Compute the correct indices. + (start, stop, step) = self._process_range_read(start, stop, step) + # Get the slice of the array + # (non-buffered version) + if self.shape: + arr = self[start:stop:step] + else: + arr = self[()] + # Build the new Array object. Use the _atom reserved keyword + # just in case the array is being copied from a native HDF5 + # with atomic types different from scalars. + # For details, see #275 of trac. + object_ = Array(group, name, arr, title=title, _log=_log, + _atom=self.atom) + nbytes = np.prod(self.shape, dtype=SizeType) * self.atom.size + + return (object_, nbytes) + + def __repr__(self): + """This provides more metainfo in addition to standard __str__""" + + return f"""{self} + atom := {self.atom!r} + maindim := {self.maindim!r} + flavor := {self.flavor!r} + byteorder := {self.byteorder!r} + chunkshape := {self.chunkshape!r}""" + + +class ImageArray(Array): + """Array containing an image. + + This class has no additional behaviour or functionality compared to + that of an ordinary array. It simply enables the user to open an + ``IMAGE`` HDF5 node as a normal `Array` node in PyTables. + + """ + + # Class identifier. + _c_classid = 'IMAGE' diff --git a/tables/atom.py b/tables/atom.py new file mode 100644 index 0000000..ade3292 --- /dev/null +++ b/tables/atom.py @@ -0,0 +1,1175 @@ +"""Atom classes for describing dataset contents.""" + +import re +import inspect +import warnings + +import numpy as np + +from .utils import SizeType +from .misc.enum import Enum + +import pickle + +from .exceptions import FlavorWarning + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + +all_types = set() # filled as atom classes are created +"""Set of all PyTables types.""" + +atom_map = {} # filled as atom classes are created +"""Maps atom kinds to item sizes and atom classes. + +If there is a fixed set of possible item sizes for a given kind, the +kind maps to another mapping from item size in bytes to atom class. +Otherwise, the kind maps directly to the atom class. +""" + +deftype_from_kind = {} # filled as atom classes are created +"""Maps atom kinds to their default atom type (if any).""" + + +_type_re = re.compile(r'^([a-z]+)([0-9]*)$') + + +def split_type(type): + """Split a PyTables type into a PyTables kind and an item size. + + Returns a tuple of (kind, itemsize). If no item size is present in the type + (in the form of a precision), the returned item size is None:: + + >>> split_type('int32') + ('int', 4) + >>> split_type('string') + ('string', None) + >>> split_type('int20') + Traceback (most recent call last): + ... + ValueError: precision must be a multiple of 8: 20 + >>> split_type('foo bar') + Traceback (most recent call last): + ... 
+ ValueError: malformed type: 'foo bar' + + """ + + match = _type_re.match(type) + if not match: + raise ValueError("malformed type: %r" % type) + kind, precision = match.groups() + itemsize = None + if precision: + precision = int(precision) + itemsize, remainder = divmod(precision, 8) + if remainder: # 0 could be a valid item size + raise ValueError("precision must be a multiple of 8: %d" + % precision) + return (kind, itemsize) + + +def _invalid_itemsize_error(kind, itemsize, itemsizes): + isizes = sorted(itemsizes) + return ValueError("invalid item size for kind ``%s``: %r; " + "it must be one of ``%r``" + % (kind, itemsize, isizes)) + + +def _abstract_atom_init(deftype, defvalue): + """Return a constructor for an abstract `Atom` class.""" + + defitemsize = split_type(deftype)[1] + + def __init__(self, itemsize=defitemsize, shape=(), dflt=defvalue): + assert self.kind in atom_map + try: + atomclass = atom_map[self.kind][itemsize] + except KeyError: + raise _invalid_itemsize_error(self.kind, itemsize, + atom_map[self.kind]) + self.__class__ = atomclass + atomclass.__init__(self, shape, dflt) + return __init__ + + +def _normalize_shape(shape): + """Check that the `shape` is safe to be used and return it as a tuple.""" + + if isinstance(shape, (np.integer, int)): + if shape < 1: + raise ValueError("shape value must be greater than 0: %d" + % shape) + shape = (shape,) # N is a shorthand for (N,) + try: + shape = tuple(shape) + except TypeError: + raise TypeError("shape must be an integer or sequence: %r" + % (shape,)) + + # XXX Get from HDF5 library if possible. + # HDF5 does not support ranks greater than 32 + if len(shape) > 32: + raise ValueError( + f"shapes with rank > 32 are not supported: {shape!r}") + + return tuple(SizeType(s) for s in shape) + + +def _normalize_default(value, dtype): + """Return `value` as a valid default of NumPy type `dtype`.""" + + # Create NumPy objects as defaults + # This is better in order to serialize them as attributes + if value is None: + value = 0 + basedtype = dtype.base + try: + default = np.array(value, dtype=basedtype) + except ValueError: + array = np.array(value) + if array.shape != basedtype.shape: + raise + # Maybe nested dtype with "scalar" value. + default = np.array(value, dtype=basedtype.base) + # 0-dim arrays will be representented as NumPy scalars + # (PyTables attribute convention) + if default.shape == (): + default = default[()] + return default + + +def _cmp_dispatcher(other_method_name): + """Dispatch comparisons to a method of the *other* object. + + Returns a new *rich comparison* method which dispatches calls to + the method `other_method_name` of the *other* object. If there is + no such method in the object, ``False`` is returned. + + This is part of the implementation of a double dispatch pattern. + """ + + def dispatched_cmp(self, other): + try: + other_method = getattr(other, other_method_name) + except AttributeError: + return False + return other_method(self) + return dispatched_cmp + + +class MetaAtom(type): + """Atom metaclass. + + This metaclass ensures that data about atom classes gets inserted + into the suitable registries. 
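Assuming the registries are filled as the concrete Atom subclasses further down are defined, their effect on a typical installation can be illustrated like this (a sketch, not part of the module)::

    from tables import atom

    assert 'int32' in atom.all_types
    assert atom.deftype_from_kind['int'] == 'int32'
    assert atom.atom_map['int'][4] is atom.Int32Atom    # fixed item size: 4 bytes
    assert atom.atom_map['string'] is atom.StringAtom   # non-fixed item size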
+ + """ + + def __init__(cls, name, bases, dict_): + super().__init__(name, bases, dict_) + + kind = dict_.get('kind') + itemsize = dict_.get('itemsize') + type_ = dict_.get('type') + deftype = dict_.get('_deftype') + + if kind and deftype: + deftype_from_kind[kind] = deftype + + if type_: + all_types.add(type_) + + if kind and itemsize and not hasattr(itemsize, '__int__'): + # Atom classes with a non-fixed item size do have an + # ``itemsize``, but it's not a number (e.g. property). + atom_map[kind] = cls + return + + if kind: # first definition of kind, make new entry + atom_map[kind] = {} + + if itemsize and hasattr(itemsize, '__int__'): # fixed + kind = cls.kind # maybe from superclasses + atom_map[kind][int(itemsize)] = cls + + +class Atom(metaclass=MetaAtom): + """Defines the type of atomic cells stored in a dataset. + + The meaning of *atomic* is that individual elements of a cell can + not be extracted directly by indexing (i.e. __getitem__()) the + dataset; e.g. if a dataset has shape (2, 2) and its atoms have + shape (3,), to get the third element of the cell at (1, 0) one + should use dataset[1,0][2] instead of dataset[1,0,2]. + + The Atom class is meant to declare the different properties of the + *base element* (also known as *atom*) of CArray, EArray and + VLArray datasets, although they are also used to describe the base + elements of Array datasets. Atoms have the property that their + length is always the same. However, you can grow datasets along + the extensible dimension in the case of EArray or put a variable + number of them on a VLArray row. Moreover, they are not restricted + to scalar values, and they can be *fully multidimensional + objects*. + + Parameters + ---------- + itemsize : int + For types with a non-fixed size, this sets the size in + bytes of individual items in the atom. + shape : tuple + Sets the shape of the atom. An integer shape of + N is equivalent to the tuple (N,). + dflt + Sets the default value for the atom. + + The following are the public methods and attributes of the Atom class. + + Notes + ----- + A series of descendant classes are offered in order to make the + use of these element descriptions easier. You should use a + particular Atom descendant class whenever you know the exact type + you will need when writing your code. Otherwise, you may use one + of the Atom.from_*() factory Methods. + + .. rubric:: Atom attributes + + .. attribute:: dflt + + The default value of the atom. + + If the user does not supply a value for an element while + filling a dataset, this default value will be written to disk. + If the user supplies a scalar value for a multidimensional + atom, this value is automatically *broadcast* to all the items + in the atom cell. If dflt is not supplied, an appropriate zero + value (or *null* string) will be chosen by default. Please + note that default values are kept internally as NumPy objects. + + .. attribute:: dtype + + The NumPy dtype that most closely matches this atom. + + .. attribute:: itemsize + + Size in bytes of a single item in the atom. + Specially useful for atoms of the string kind. + + .. attribute:: kind + + The PyTables kind of the atom (a string). + + .. attribute:: shape + + The shape of the atom (a tuple for scalar atoms). + + .. attribute:: type + + The PyTables type of the atom (a string). 
+ + Atoms can be compared with atoms and other objects for + strict (in)equality without having to compare individual + attributes:: + + >>> atom1 = StringAtom(itemsize=10) # same as ``atom2`` + >>> atom2 = Atom.from_kind('string', 10) # same as ``atom1`` + >>> atom3 = IntAtom() + >>> atom1 == 'foo' + False + >>> atom1 == atom2 + True + >>> atom2 != atom1 + False + >>> atom1 == atom3 + False + >>> atom3 != atom2 + True + + """ + + @classmethod + def prefix(cls): + """Return the atom class prefix.""" + cname = cls.__name__ + return cname[:cname.rfind('Atom')] + + @classmethod + def from_sctype(cls, sctype, shape=(), dflt=None): + """Create an Atom from a NumPy scalar type sctype. + + Optional shape and default value may be specified as the + shape and dflt + arguments, respectively. Information in the + sctype not represented in an Atom is ignored:: + + >>> import numpy as np + >>> Atom.from_sctype(np.int16, shape=(2, 2)) + Int16Atom(shape=(2, 2), dflt=0) + >>> Atom.from_sctype('S5', dflt='hello') + Traceback (most recent call last): + ... + ValueError: unknown NumPy scalar type: 'S5' + >>> Atom.from_sctype('float64') + Float64Atom(shape=(), dflt=0.0) + + """ + if (not isinstance(sctype, type) + or not issubclass(sctype, np.generic)): + if sctype not in np.sctypeDict: + raise ValueError(f"unknown NumPy scalar type: {sctype!r}") + sctype = np.sctypeDict[sctype] + return cls.from_dtype(np.dtype((sctype, shape)), dflt) + + @classmethod + def from_dtype(cls, dtype, dflt=None): + """Create an Atom from a NumPy dtype. + + An optional default value may be specified as the dflt + argument. Information in the dtype not represented in an Atom is + ignored:: + + >>> import numpy as np + >>> Atom.from_dtype(np.dtype((np.int16, (2, 2)))) + Int16Atom(shape=(2, 2), dflt=0) + >>> Atom.from_dtype(np.dtype('float64')) + Float64Atom(shape=(), dflt=0.0) + + Note: for easier use in Python 3, where all strings lead to the + Unicode dtype, this dtype will also generate a StringAtom. Since + this is only viable for strings that are castable as ascii, a + warning is issued. + + >>> Atom.from_dtype(np.dtype('U20')) # doctest: +SKIP + Atom.py:392: FlavorWarning: support for unicode type is very + limited, and only works for strings that can be cast as ascii + StringAtom(itemsize=20, shape=(), dflt=b'') + + """ + basedtype = dtype.base + if basedtype.names: + raise ValueError("compound data types are not supported: %r" + % dtype) + if basedtype.shape != (): + raise ValueError("nested data types are not supported: %r" + % dtype) + if basedtype.kind == 'S': # can not reuse something like 'string80' + itemsize = basedtype.itemsize + return cls.from_kind('string', itemsize, dtype.shape, dflt) + elif basedtype.kind == 'U': + # workaround for unicode type (standard string type in Python 3) + warnings.warn("support for unicode type is very limited, and " + "only works for strings that can be cast as ascii", + FlavorWarning) + itemsize = basedtype.itemsize // 4 + assert str(itemsize) in basedtype.str, ( + "something went wrong in handling unicode.") + return cls.from_kind('string', itemsize, dtype.shape, dflt) + # Most NumPy types have direct correspondence with PyTables types. + return cls.from_type(basedtype.name, dtype.shape, dflt) + + @classmethod + def from_type(cls, type, shape=(), dflt=None): + """Create an Atom from a PyTables type. 
+ + Optional shape and default value may be specified as the + shape and dflt arguments, respectively:: + + >>> Atom.from_type('bool') + BoolAtom(shape=(), dflt=False) + >>> Atom.from_type('int16', shape=(2, 2)) + Int16Atom(shape=(2, 2), dflt=0) + >>> Atom.from_type('string40', dflt='hello') + Traceback (most recent call last): + ... + ValueError: unknown type: 'string40' + >>> Atom.from_type('Float64') + Traceback (most recent call last): + ... + ValueError: unknown type: 'Float64' + + """ + + if type not in all_types: + raise ValueError(f"unknown type: {type!r}") + kind, itemsize = split_type(type) + return cls.from_kind(kind, itemsize, shape, dflt) + + @classmethod + def from_kind(cls, kind, itemsize=None, shape=(), dflt=None): + """Create an Atom from a PyTables kind. + + Optional item size, shape and default value may be + specified as the itemsize, shape and dflt + arguments, respectively. Bear in mind that not all atoms support + a default item size:: + + >>> Atom.from_kind('int', itemsize=2, shape=(2, 2)) + Int16Atom(shape=(2, 2), dflt=0) + >>> Atom.from_kind('int', shape=(2, 2)) + Int32Atom(shape=(2, 2), dflt=0) + >>> Atom.from_kind('int', shape=1) + Int32Atom(shape=(1,), dflt=0) + >>> Atom.from_kind('string', dflt=b'hello') + Traceback (most recent call last): + ... + ValueError: no default item size for kind ``string`` + >>> Atom.from_kind('Float') + Traceback (most recent call last): + ... + ValueError: unknown kind: 'Float' + + Moreover, some kinds with atypical constructor signatures + are not supported; you need to use the proper + constructor:: + + >>> Atom.from_kind('enum') #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + ValueError: the ``enum`` kind is not supported... + + """ + + kwargs = {'shape': shape} + if kind not in atom_map: + raise ValueError(f"unknown kind: {kind!r}") + # This incompatibility detection may get out-of-date and is + # too hard-wired, but I couldn't come up with something + # smarter. -- Ivan (2007-02-08) + if kind in ['enum']: + raise ValueError("the ``%s`` kind is not supported; " + "please use the appropriate constructor" + % kind) + # If no `itemsize` is given, try to get the default type of the + # kind (which has a fixed item size). + if itemsize is None: + if kind not in deftype_from_kind: + raise ValueError("no default item size for kind ``%s``" + % kind) + type_ = deftype_from_kind[kind] + kind, itemsize = split_type(type_) + kdata = atom_map[kind] + # Look up the class and set a possible item size. + if hasattr(kdata, 'kind'): # atom class: non-fixed item size + atomclass = kdata + kwargs['itemsize'] = itemsize + else: # dictionary: fixed item size + if itemsize not in kdata: + raise _invalid_itemsize_error(kind, itemsize, kdata) + atomclass = kdata[itemsize] + # Only set a `dflt` argument if given (`None` may not be understood). + if dflt is not None: + kwargs['dflt'] = dflt + + return atomclass(**kwargs) + + @property + def size(self): + """Total size in bytes of the atom.""" + return self.dtype.itemsize + + @property + def recarrtype(self): + """String type to be used in numpy.rec.array().""" + return str(self.dtype.shape) + self.dtype.base.str[1:] + + @property + def ndim(self): + """The number of dimensions of the atom. + + .. 
versionadded:: 2.4""" + return len(self.shape) + + def __init__(self, nptype, shape, dflt): + if not hasattr(self, 'type'): + raise NotImplementedError("``%s`` is an abstract class; " + "please use one of its subclasses" + % self.__class__.__name__) + self.shape = shape = _normalize_shape(shape) + """The shape of the atom (a tuple for scalar atoms).""" + # Curiously enough, NumPy isn't generally able to accept NumPy + # integers in a shape. ;( + npshape = tuple(int(s) for s in shape) + self.dtype = dtype = np.dtype((nptype, npshape)) + """The NumPy dtype that most closely matches this atom.""" + self.dflt = _normalize_default(dflt, dtype) + """The default value of the atom. + + If the user does not supply a value for an element while + filling a dataset, this default value will be written to + disk. If the user supplies a scalar value for a + multidimensional atom, this value is automatically *broadcast* + to all the items in the atom cell. If dflt is not supplied, an + appropriate zero value (or *null* string) will be chosen by + default. Please note that default values are kept internally + as NumPy objects.""" + + def __repr__(self): + args = f'shape={self.shape}, dflt={self.dflt!r}' + if not hasattr(self.__class__.itemsize, '__int__'): # non-fixed + args = f'itemsize={self.itemsize}, {args}' + return f'{self.__class__.__name__}({args})' + + __eq__ = _cmp_dispatcher('_is_equal_to_atom') + + def __ne__(self, other): + return not self.__eq__(other) + + # XXX: API incompatible change for PyTables 3 line + # Overriding __eq__ blocks inheritance of __hash__ in 3.x + # def __hash__(self): + # return hash((self.__class__, self.type, self.shape, self.itemsize, + # self.dflt)) + + def copy(self, **override): + """Get a copy of the atom, possibly overriding some arguments. + + Constructor arguments to be overridden must be passed as + keyword arguments:: + + >>> atom1 = Int32Atom(shape=12) + >>> atom2 = atom1.copy() + >>> print(atom1) + Int32Atom(shape=(12,), dflt=0) + >>> print(atom2) + Int32Atom(shape=(12,), dflt=0) + >>> atom1 is atom2 + False + >>> atom3 = atom1.copy(shape=(2, 2)) + >>> print(atom3) + Int32Atom(shape=(2, 2), dflt=0) + >>> atom1.copy(foobar=42) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + TypeError: ...__init__() got an unexpected keyword argument 'foobar' + + """ + newargs = self._get_init_args() + newargs.update(override) + return self.__class__(**newargs) + + def _get_init_args(self): + """Get a dictionary of instance constructor arguments. + + This implementation works on classes which use the same names + for both constructor arguments and instance attributes. + + """ + signature = inspect.signature(self.__init__) + parameters = signature.parameters + args = [arg for arg, p in parameters.items() + if p.kind is p.POSITIONAL_OR_KEYWORD] + + return {arg: getattr(self, arg) for arg in args if arg != 'self'} + + def _is_equal_to_atom(self, atom): + """Is this object equal to the given `atom`?""" + + return (self.type == atom.type and self.shape == atom.shape + and self.itemsize == atom.itemsize + and np.all(self.dflt == atom.dflt)) + + +class StringAtom(Atom): + """Defines an atom of type string. + + The item size is the *maximum* length in characters of strings. 
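+
+    A minimal usage sketch (the default ``dflt`` value is ``b''``)::
+
+    >>> StringAtom(itemsize=5)
+    StringAtom(itemsize=5, shape=(), dflt=b'')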
+
+    """
+
+    kind = 'string'
+    type = 'string'
+    _defvalue = b''
+
+    @property
+    def itemsize(self):
+        """Size in bytes of a single item in the atom."""
+        return self.dtype.base.itemsize
+
+    def __init__(self, itemsize, shape=(), dflt=_defvalue):
+        if not hasattr(itemsize, '__int__') or int(itemsize) < 0:
+            raise ValueError("invalid item size for kind ``%s``: %r; "
+                             "it must be a positive integer"
+                             % ('string', itemsize))
+        Atom.__init__(self, 'S%d' % itemsize, shape, dflt)
+
+
+class BoolAtom(Atom):
+    """Defines an atom of type bool."""
+
+    kind = 'bool'
+    itemsize = 1
+    type = 'bool'
+    _deftype = 'bool8'
+    _defvalue = False
+
+    def __init__(self, shape=(), dflt=_defvalue):
+        Atom.__init__(self, self.type, shape, dflt)
+
+
+class IntAtom(Atom):
+    """Defines an atom of a signed integral type (int kind)."""
+
+    kind = 'int'
+    signed = True
+    _deftype = 'int32'
+    _defvalue = 0
+    __init__ = _abstract_atom_init(_deftype, _defvalue)
+
+
+class UIntAtom(Atom):
+    """Defines an atom of an unsigned integral type (uint kind)."""
+
+    kind = 'uint'
+    signed = False
+    _deftype = 'uint32'
+    _defvalue = 0
+    __init__ = _abstract_atom_init(_deftype, _defvalue)
+
+
+class FloatAtom(Atom):
+    """Defines an atom of a floating point type (float kind)."""
+
+    kind = 'float'
+    _deftype = 'float64'
+    _defvalue = 0.0
+    __init__ = _abstract_atom_init(_deftype, _defvalue)
+
+
+def _create_numeric_class(baseclass, itemsize):
+    """Create a numeric atom class with the given `baseclass` and an
+    `itemsize`."""
+
+    prefix = '%s%d' % (baseclass.prefix(), itemsize * 8)
+    type_ = prefix.lower()
+    classdict = {'itemsize': itemsize, 'type': type_,
+                 '__doc__': "Defines an atom of type ``%s``." % type_}
+
+    def __init__(self, shape=(), dflt=baseclass._defvalue):
+        Atom.__init__(self, self.type, shape, dflt)
+    classdict['__init__'] = __init__
+    return type('%sAtom' % prefix, (baseclass,), classdict)
+
+
+Int8Atom = _create_numeric_class(IntAtom, 1)
+Int16Atom = _create_numeric_class(IntAtom, 2)
+Int32Atom = _create_numeric_class(IntAtom, 4)
+Int64Atom = _create_numeric_class(IntAtom, 8)
+UInt8Atom = _create_numeric_class(UIntAtom, 1)
+UInt16Atom = _create_numeric_class(UIntAtom, 2)
+UInt32Atom = _create_numeric_class(UIntAtom, 4)
+UInt64Atom = _create_numeric_class(UIntAtom, 8)
+
+if hasattr(np, 'float16'):
+    Float16Atom = _create_numeric_class(FloatAtom, 2)
+Float32Atom = _create_numeric_class(FloatAtom, 4)
+Float64Atom = _create_numeric_class(FloatAtom, 8)
+if hasattr(np, 'float96'):
+    Float96Atom = _create_numeric_class(FloatAtom, 12)
+if hasattr(np, 'float128'):
+    Float128Atom = _create_numeric_class(FloatAtom, 16)
+
+
+class ComplexAtom(Atom):
+    """Defines an atom of kind complex.
+
+    Allowed item sizes are 8 (single precision) and 16 (double precision). This
+    class must be used instead of more concrete ones to avoid confusion with
+    numarray-like precision specifications used in PyTables 1.X.
+
+    """
+
+    # This definition is a little more complex (no pun intended)
+    # because, although the complex kind is a normal numerical one,
+    # the usage of bottom-level classes is artificially forbidden.
+    # Everything will be back to normality when people have stopped
+    # using the old bottom-level complex classes.
+
+    kind = 'complex'
+    _deftype = 'complex128'
+    _defvalue = 0j
+    _isizes = [8, 16]
+
+    @property
+    def itemsize(self):
+        """Size in bytes of a single item in the atom."""
+        return self.dtype.base.itemsize
+
+    # Only instances have a `type` attribute, so complex types must be
+    # registered by hand.
+ all_types.add('complex64') + all_types.add('complex128') + if hasattr(np, 'complex192'): + all_types.add('complex192') + _isizes.append(24) + if hasattr(np, 'complex256'): + all_types.add('complex256') + _isizes.append(32) + + def __init__(self, itemsize, shape=(), dflt=_defvalue): + if itemsize not in self._isizes: + raise _invalid_itemsize_error('complex', itemsize, self._isizes) + self.type = '%s%d' % (self.kind, itemsize * 8) + Atom.__init__(self, self.type, shape, dflt) + + +class _ComplexErrorAtom(ComplexAtom, metaclass=type): + """Reminds the user to stop using the old complex atom names.""" + + def __init__(self, shape=(), dflt=ComplexAtom._defvalue): + raise TypeError( + "to avoid confusions with PyTables 1.X complex atom names, " + "please use ``ComplexAtom(itemsize=N)``, " + "where N=8 for single precision complex atoms, " + "and N=16 for double precision complex atoms") + + +Complex32Atom = Complex64Atom = Complex128Atom = _ComplexErrorAtom +if hasattr(np, 'complex192'): + Complex192Atom = _ComplexErrorAtom +if hasattr(np, 'complex256'): + Complex256Atom = _ComplexErrorAtom + + +class TimeAtom(Atom): + """Defines an atom of time type (time kind). + + There are two distinct supported types of time: a 32 bit integer value and + a 64 bit floating point value. Both of them reflect the number of seconds + since the Unix epoch. This atom has the property of being stored using the + HDF5 time datatypes. + + """ + + kind = 'time' + _deftype = 'time32' + _defvalue = 0 + __init__ = _abstract_atom_init(_deftype, _defvalue) + + +class Time32Atom(TimeAtom): + """Defines an atom of type time32.""" + + itemsize = 4 + type = 'time32' + _defvalue = 0 + + def __init__(self, shape=(), dflt=_defvalue): + Atom.__init__(self, 'int32', shape, dflt) + + +class Time64Atom(TimeAtom): + """Defines an atom of type time64.""" + + itemsize = 8 + type = 'time64' + _defvalue = 0.0 + + def __init__(self, shape=(), dflt=_defvalue): + Atom.__init__(self, 'float64', shape, dflt) + + +class EnumAtom(Atom): + """Description of an atom of an enumerated type. + + Instances of this class describe the atom type used to store enumerated + values. Those values belong to an enumerated type, defined by the first + argument (enum) in the constructor of the atom, which accepts the same + kinds of arguments as the Enum class (see :ref:`EnumClassDescr`). The + enumerated type is stored in the enum attribute of the atom. + + A default value must be specified as the second argument (dflt) in the + constructor; it must be the *name* (a string) of one of the enumerated + values in the enumerated type. When the atom is created, the corresponding + concrete value is broadcast and stored in the dflt attribute (setting + different default values for items in a multidimensional atom is not + supported yet). If the name does not match any value in the enumerated + type, a KeyError is raised. + + Another atom must be specified as the base argument in order to determine + the base type used for storing the values of enumerated values in memory + and disk. This *storage atom* is kept in the base attribute of the created + atom. As a shorthand, you may specify a PyTables type instead of the + storage atom, implying that this has a scalar shape. + + The storage atom should be able to represent each and every concrete value + in the enumeration. If it is not, a TypeError is raised. The default value + of the storage atom is ignored. + + The type attribute of enumerated atoms is always enum. 
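+
+    For instance (a minimal sketch; the storage atom below is derived from
+    the 'int8' PyTables type passed as base)::
+
+    >>> atom = EnumAtom(['T0', 'T1', 'T2'], 'T0', 'int8')
+    >>> atom.type
+    'enum'
+    >>> atom.base
+    Int8Atom(shape=(), dflt=0)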
+ + Enumerated atoms also support comparisons with other objects:: + + >>> enum = ['T0', 'T1', 'T2'] + >>> atom1 = EnumAtom(enum, 'T0', 'int8') # same as ``atom2`` + >>> atom2 = EnumAtom(enum, 'T0', Int8Atom()) # same as ``atom1`` + >>> atom3 = EnumAtom(enum, 'T0', 'int16') + >>> atom4 = Int8Atom() + >>> atom1 == enum + False + >>> atom1 == atom2 + True + >>> atom2 != atom1 + False + >>> atom1 == atom3 + False + >>> atom1 == atom4 + False + >>> atom4 != atom1 + True + + Examples + -------- + + The next C enum construction:: + + enum myEnum { + T0, + T1, + T2 + }; + + would correspond to the following PyTables + declaration:: + + >>> my_enum_atom = EnumAtom(['T0', 'T1', 'T2'], 'T0', 'int32') + + Please note the dflt argument with a value of 'T0'. Since the concrete + value matching T0 is unknown right now (we have not used explicit concrete + values), using the name is the only option left for defining a default + value for the atom. + + The chosen representation of values for this enumerated atom uses unsigned + 32-bit integers, which surely wastes quite a lot of memory. Another size + could be selected by using the base argument (this time with a full-blown + storage atom):: + + >>> my_enum_atom = EnumAtom(['T0', 'T1', 'T2'], 'T0', UInt8Atom()) + + You can also define multidimensional arrays for data elements:: + + >>> my_enum_atom = EnumAtom( + ... ['T0', 'T1', 'T2'], 'T0', base='uint32', shape=(3,2)) + + for 3x2 arrays of uint32. + + """ + + # Registering this class in the class map may be a little wrong, + # since the ``Atom.from_kind()`` method fails miserably with + # enumerations, as they don't support an ``itemsize`` argument. + # However, resetting ``__metaclass__`` to ``type`` doesn't seem to + # work and I don't feel like creating a subclass of ``MetaAtom``. + + kind = 'enum' + type = 'enum' + + @property + def itemsize(self): + """Size in bytes of a single item in the atom.""" + return self.dtype.base.itemsize + + def _checkbase(self, base): + """Check the `base` storage atom.""" + + if base.kind == 'enum': + raise TypeError("can not use an enumerated atom " + "as a storage atom: %r" % base) + + # Check whether the storage atom can represent concrete values + # in the enumeration... + basedtype = base.dtype + pyvalues = [value for (name, value) in self.enum] + try: + npgenvalues = np.array(pyvalues) + except ValueError: + raise TypeError("concrete values are not uniformly-shaped") + try: + npvalues = np.array(npgenvalues, dtype=basedtype.base) + except ValueError: + raise TypeError("storage atom type is incompatible with " + "concrete values in the enumeration") + if npvalues.shape[1:] != basedtype.shape: + raise TypeError("storage atom shape does not match that of " + "concrete values in the enumeration") + if npvalues.tolist() != npgenvalues.tolist(): + raise TypeError("storage atom type lacks precision for " + "concrete values in the enumeration") + + # ...with some implementation limitations. 
+ if npvalues.dtype.kind not in ['i', 'u']: + raise NotImplementedError("only integer concrete values " + "are supported for the moment, sorry") + if len(npvalues.shape) > 1: + raise NotImplementedError("only scalar concrete values " + "are supported for the moment, sorry") + + def _get_init_args(self): + """Get a dictionary of instance constructor arguments.""" + + return dict(enum=self.enum, dflt=self._defname, + base=self.base, shape=self.shape) + + def _is_equal_to_atom(self, atom): + """Is this object equal to the given `atom`?""" + + return False + + def _is_equal_to_enumatom(self, enumatom): + """Is this object equal to the given `enumatom`?""" + + return (self.enum == enumatom.enum and self.shape == enumatom.shape + and np.all(self.dflt == enumatom.dflt) + and self.base == enumatom.base) + + def __init__(self, enum, dflt, base, shape=()): + if not isinstance(enum, Enum): + enum = Enum(enum) + self.enum = enum + + if isinstance(base, str): + base = Atom.from_type(base) + self._checkbase(base) + self.base = base + + default = enum[dflt] # check default value + self._defname = dflt # kept for representation purposes + + # These are kept to ease dumping this particular + # representation of the enumeration to storage. + names, values = [], [] + for (name, value) in enum: + names.append(name) + values.append(value) + basedtype = self.base.dtype + + self._names = names + self._values = np.array(values, dtype=basedtype.base) + + Atom.__init__(self, basedtype, shape, default) + + def __repr__(self): + return ('EnumAtom(enum=%r, dflt=%r, base=%r, shape=%r)' + % (self.enum, self._defname, self.base, self.shape)) + + __eq__ = _cmp_dispatcher('_is_equal_to_enumatom') + + # XXX: API incompatible change for PyTables 3 line + # Overriding __eq__ blocks inheritance of __hash__ in 3.x + # def __hash__(self): + # return hash((self.__class__, self.enum, self.shape, self.dflt, + # self.base)) + + +class ReferenceAtom(Atom): + """Defines an atom of type object to read references. + This atom is read-only. + """ + + kind = 'reference' + type = 'object' + _deftype = 'NoneType' + _defvalue = None + + @property + def itemsize(self): + """Size in bytes of a single item in the atom.""" + return self.dtype.base.itemsize + + def __init__(self, shape=()): + Atom.__init__(self, self.type, shape, self._defvalue) + + def __repr__(self): + return f'ReferenceAtom(shape={self.shape})' + +# Pseudo-atom classes +# =================== +# +# Now, there come three special classes, `ObjectAtom`, `VLStringAtom` +# and `VLUnicodeAtom`, that actually do not descend from `Atom`, but +# which goal is so similar that they should be described here. +# Pseudo-atoms can only be used with `VLArray` datasets, and they do +# not support multidimensional values, nor multiple values per row. +# +# They can be recognised because they also have ``kind``, ``type`` and +# ``shape`` attributes, but no ``size``, ``itemsize`` or ``dflt`` +# ones. Instead, they have a ``base`` atom which defines the elements +# used for storage. +# +# See ``examples/vlarray1.py`` and ``examples/vlarray2.py`` for +# further examples on `VLArray` datasets, including object +# serialization and string management. + + +class PseudoAtom: + """Pseudo-atoms can only be used in ``VLArray`` nodes. + + They can be recognised because they also have `kind`, `type` and + `shape` attributes, but no `size`, `itemsize` or `dflt` ones. + Instead, they have a `base` atom which defines the elements used + for storage. 
+ """ + + def __repr__(self): + return '%s()' % self.__class__.__name__ + + def toarray(self, object_): + """Convert an `object_` into an array of base atoms.""" + + raise NotImplementedError + + def fromarray(self, array): + """Convert an `array` of base atoms into an object.""" + + raise NotImplementedError + + +class _BufferedAtom(PseudoAtom): + """Pseudo-atom which stores data as a buffer (flat array of uints).""" + + shape = () + + def toarray(self, object_): + buffer_ = self._tobuffer(object_) + array = np.ndarray(buffer=buffer_, dtype=self.base.dtype, + shape=len(buffer_)) + return array + + def _tobuffer(self, object_): + """Convert an `object_` into a buffer.""" + + raise NotImplementedError + + +class VLStringAtom(_BufferedAtom): + """Defines an atom of type ``vlstring``. + + This class describes a *row* of the VLArray class, rather than an atom. It + differs from the StringAtom class in that you can only add *one instance of + it to one specific row*, i.e. the :meth:`VLArray.append` method only + accepts one object when the base atom is of this type. + + This class stores bytestrings. It does not make assumptions on the + encoding of the string, and raw bytes are stored as is. To store a string + you will need to *explicitly* convert it to a bytestring before you can + save them:: + + >>> s = 'A unicode string: hbar = \u210f' + >>> bytestring = s.encode('utf-8') + >>> VLArray.append(bytestring) # doctest: +SKIP + + For full Unicode support, using VLUnicodeAtom (see :ref:`VLUnicodeAtom`) is + recommended. + + Variable-length string atoms do not accept parameters and they cause the + reads of rows to always return Python bytestrings. You can regard vlstring + atoms as an easy way to save generic variable length strings. + + """ + + kind = 'vlstring' + type = 'vlstring' + base = UInt8Atom() + + def _tobuffer(self, object_): + if isinstance(object_, str): + warnings.warn("Storing non bytestrings in VLStringAtom is " + "deprecated.", DeprecationWarning) + elif not isinstance(object_, bytes): + raise TypeError(f"object is not a string: {object_!r}") + return np.string_(object_) + + def fromarray(self, array): + return array.tobytes() + + +class VLUnicodeAtom(_BufferedAtom): + """Defines an atom of type vlunicode. + + This class describes a *row* of the VLArray class, rather than an atom. It + is very similar to VLStringAtom (see :ref:`VLStringAtom`), but it stores + Unicode strings (using 32-bit characters a la UCS-4, so all strings of the + same length also take up the same space). + + This class does not make assumptions on the encoding of plain input + strings. Plain strings are supported as long as no character is out of the + ASCII set; otherwise, you will need to *explicitly* convert them to Unicode + before you can save them. + + Variable-length Unicode atoms do not accept parameters and they cause the + reads of rows to always return Python Unicode strings. You can regard + vlunicode atoms as an easy way to save variable length Unicode strings. + + """ + + kind = 'vlunicode' + type = 'vlunicode' + base = UInt32Atom() + + # numpy.unicode_ no more implements the buffer interface in Python 3 + # + # When the Python build is UCS-2, we need to promote the + # Unicode string to UCS-4. We *must* use a 0-d array since + # NumPy scalars inherit the UCS-2 encoding from Python (see + # NumPy ticket #525). Since ``_tobuffer()`` can't return an + # array, we must override ``toarray()`` itself. 
+ def toarray(self, object_): + if isinstance(object_, bytes): + warnings.warn("Storing bytestrings in VLUnicodeAtom is " + "deprecated.", DeprecationWarning) + elif not isinstance(object_, str): + raise TypeError(f"object is not a string: {object_!r}") + ustr = str(object_) + uarr = np.array(ustr, dtype='U') + return np.ndarray( + buffer=uarr, dtype=self.base.dtype, shape=len(ustr)) + + def _tobuffer(self, object_): + # This works (and is used) only with UCS-4 builds of Python, + # where the width of the internal representation of a + # character matches that of the base atoms. + if isinstance(object_, bytes): + warnings.warn("Storing bytestrings in VLUnicodeAtom is " + "deprecated.", DeprecationWarning) + elif not isinstance(object_, str): + raise TypeError(f"object is not a string: {object_!r}") + return np.unicode_(object_) + + def fromarray(self, array): + length = len(array) + if length == 0: + return '' # ``array.view('U0')`` raises a `TypeError` + return array.view('U%d' % length).item() + + +class ObjectAtom(_BufferedAtom): + """Defines an atom of type object. + + This class is meant to fit *any* kind of Python object in a row of a + VLArray dataset by using pickle behind the scenes. Due to the fact that + you can not foresee how long will be the output of the pickle + serialization (i.e. the atom already has a *variable* length), you can only + fit *one object per row*. However, you can still group several objects in a + single tuple or list and pass it to the :meth:`VLArray.append` method. + + Object atoms do not accept parameters and they cause the reads of rows to + always return Python objects. You can regard object atoms as an easy way to + save an arbitrary number of generic Python objects in a VLArray dataset. + + """ + + kind = 'object' + type = 'object' + base = UInt8Atom() + + def _tobuffer(self, object_): + return pickle.dumps(object_, pickle.HIGHEST_PROTOCOL) + + def fromarray(self, array): + # We have to check for an empty array because of a possible + # bug in HDF5 which makes it claim that a dataset has one + # record when in fact it is empty. + if array.size == 0: + return None + return pickle.loads(array.tobytes()) diff --git a/tables/attributeset.py b/tables/attributeset.py new file mode 100644 index 0000000..6217ac6 --- /dev/null +++ b/tables/attributeset.py @@ -0,0 +1,688 @@ +"""Here is defined the AttributeSet class.""" + +import re +import sys +import warnings +import pickle +import numpy as np + +from . import hdf5extension +from .utils import SizeType +from .registry import class_name_dict +from .exceptions import ClosedNodeError, PerformanceWarning +from .path import check_attribute_name +from .undoredo import attr_to_shadow +from .filters import Filters + + +# System attributes +SYS_ATTRS = ["CLASS", "VERSION", "TITLE", "NROWS", "EXTDIM", + "ENCODING", "PYTABLES_FORMAT_VERSION", + "FLAVOR", "FILTERS", "AUTO_INDEX", + "DIRTY", "NODE_TYPE", "NODE_TYPE_VERSION", + "PSEUDOATOM"] +# Prefixes of other system attributes +SYS_ATTRS_PREFIXES = ["FIELD_"] +# RO_ATTRS will be disabled and let the user modify them if they +# want to. The user is still not allowed to remove or rename +# system attributes. 
Francesc Alted 2004-12-19 +# Read-only attributes: +# RO_ATTRS = ["CLASS", "FLAVOR", "VERSION", "NROWS", "EXTDIM", +# "PYTABLES_FORMAT_VERSION", "FILTERS", +# "NODE_TYPE", "NODE_TYPE_VERSION"] +# RO_ATTRS = [] + +# The next attributes are not meant to be copied during a Node copy process +SYS_ATTRS_NOTTOBECOPIED = ["CLASS", "VERSION", "TITLE", "NROWS", "EXTDIM", + "PYTABLES_FORMAT_VERSION", "FILTERS", "ENCODING"] +# Attributes forced to be copied during node copies +FORCE_COPY_CLASS = ['CLASS', 'VERSION'] +# Regular expression for column default values. +_field_fill_re = re.compile('^FIELD_[0-9]+_FILL$') +# Regular expression for fixing old pickled filters. +_old_filters_re = re.compile(br'\(([ic])tables\.Leaf\n') +# Fixed version of the previous string. +_new_filters_sub = br'(\1tables.filters\n' + + +def issysattrname(name): + """Check if a name is a system attribute or not""" + + return bool(name in SYS_ATTRS or np.prod( + [name.startswith(prefix) for prefix in SYS_ATTRS_PREFIXES])) + + +class AttributeSet(hdf5extension.AttributeSet): + """Container for the HDF5 attributes of a Node. + + This class provides methods to create new HDF5 node attributes, + and to get, rename or delete existing ones. + + Like in Group instances (see :ref:`GroupClassDescr`), AttributeSet + instances make use of the *natural naming* convention, i.e. you can + access the attributes on disk as if they were normal Python + attributes of the AttributeSet instance. + + This offers the user a very convenient way to access HDF5 node + attributes. However, for this reason and in order not to pollute the + object namespace, one can not assign *normal* attributes to + AttributeSet instances, and their members use names which start by + special prefixes as happens with Group objects. + + .. rubric:: Notes on native and pickled attributes + + The values of most basic types are saved as HDF5 native data in the + HDF5 file. This includes Python bool, int, float, complex and str + (but not long nor unicode) values, as well as their NumPy scalar + versions and homogeneous or *structured* NumPy arrays of them. When + read, these values are always loaded as NumPy scalar or array + objects, as needed. + + For that reason, attributes in native HDF5 files will be always + mapped into NumPy objects. Specifically, a multidimensional + attribute will be mapped into a multidimensional ndarray and a + scalar will be mapped into a NumPy scalar object (for example, a + scalar H5T_NATIVE_LLONG will be read and returned as a numpy.int64 + scalar). + + However, other kinds of values are serialized using pickle, so you + only will be able to correctly retrieve them using a Python-aware + HDF5 library. Thus, if you want to save Python scalar values and + make sure you are able to read them with generic HDF5 tools, you + should make use of *scalar or homogeneous/structured array NumPy + objects* (for example, numpy.int64(1) or numpy.array([1, 2, 3], + dtype='int16')). + + One more advice: because of the various potential difficulties in + restoring a Python object stored in an attribute, you may end up + getting a pickle string where a Python object is expected. If this + is the case, you may wish to run pickle.loads() on that string to + get an idea of where things went wrong, as shown in this example:: + + >>> import os, tempfile + >>> import tables as tb + >>> + >>> class MyClass: + ... foo = 'bar' + ... 
+ >>> myObject = MyClass() # save object of custom class in HDF5 attr + >>> h5fname = tempfile.mktemp(suffix='.h5') + >>> h5f = tb.open_file(h5fname, 'w') + >>> h5f.root._v_attrs.obj = myObject # store the object + >>> print(h5f.root._v_attrs.obj.foo) # retrieve it + bar + >>> h5f.close() + >>> + >>> del MyClass, myObject # delete class of object and reopen file + >>> h5f = tb.open_file(h5fname, 'r') + >>> print(repr(h5f.root._v_attrs.obj)) + b'ccopy_reg\\n_reconstructor... + >>> import pickle # let's unpickle that to see what went wrong + >>> pickle.loads(h5f.root._v_attrs.obj) + Traceback (most recent call last): + ... + AttributeError: Can't get attribute 'MyClass' ... + >>> # So the problem was not in the stored object, + ... # but in the *environment* where it was restored. + ... h5f.close() + >>> os.remove(h5fname) + + + .. rubric:: Notes on AttributeSet methods + + Note that this class overrides the __getattr__(), __setattr__(), + __delattr__() and __dir__() special methods. This allows you to + read, assign or delete attributes on disk by just using the next + constructs:: + + leaf.attrs.myattr = 'str attr' # set a string (native support) + leaf.attrs.myattr2 = 3 # set an integer (native support) + leaf.attrs.myattr3 = [3, (1, 2)] # a generic object (Pickled) + attrib = leaf.attrs.myattr # get the attribute ``myattr`` + del leaf.attrs.myattr # delete the attribute ``myattr`` + + In addition, the dictionary-like __getitem__(), __setitem__() and + __delitem__() methods are available, so you may write things like + this:: + + for name in node._v_attrs._f_list(): + print("name: %s, value: %s" % (name, node._v_attrs[name])) + + Use whatever idiom you prefer to access the attributes. + + Finally, on interactive python sessions you may get autocompletions of + attributes named as *valid python identifiers* by pressing the `[Tab]` + key, or to use the dir() global function. + + If an attribute is set on a target node that already has a large + number of attributes, a PerformanceWarning will be issued. + + + .. rubric:: AttributeSet attributes + + .. attribute:: _v_attrnames + + A list with all attribute names. + + .. attribute:: _v_attrnamessys + + A list with system attribute names. + + .. attribute:: _v_attrnamesuser + + A list with user attribute names. + + .. attribute:: _v_unimplemented + + A list of attribute names with unimplemented native HDF5 types. + + """ + + def _g_getnode(self): + return self._v__nodefile._get_node(self._v__nodepath) + + @property + def _v_node(self): + """The :class:`Node` instance this attribute set is associated with.""" + return self._g_getnode() + + def __init__(self, node): + """Create the basic structures to keep the attribute information. + + Reads all the HDF5 attributes (if any) on disk for the node "node". + + Parameters + ---------- + node + The parent node + + """ + + # Refuse to create an instance of an already closed node + if not node._v_isopen: + raise ClosedNodeError("the node for attribute set is closed") + + dict_ = self.__dict__ + + self._g_new(node) + dict_["_v__nodefile"] = node._v_file + dict_["_v__nodepath"] = node._v_pathname + dict_["_v_attrnames"] = self._g_list_attr(node) + # The list of unimplemented attribute names + dict_["_v_unimplemented"] = [] + + # Get the file version format. This is an optimization + # in order to avoid accessing it too much. 
+ try: + format_version = node._v_file.format_version + except AttributeError: + parsed_version = None + else: + if format_version == 'unknown': + parsed_version = None + else: + parsed_version = tuple(map(int, format_version.split('.'))) + dict_["_v__format_version"] = parsed_version + # Split the attribute list in system and user lists + dict_["_v_attrnamessys"] = [] + dict_["_v_attrnamesuser"] = [] + for attr in self._v_attrnames: + # put the attributes on the local dictionary to allow + # tab-completion + self.__getattr__(attr) + if issysattrname(attr): + self._v_attrnamessys.append(attr) + else: + self._v_attrnamesuser.append(attr) + + # Sort the attributes + self._v_attrnames.sort() + self._v_attrnamessys.sort() + self._v_attrnamesuser.sort() + + def _g_update_node_location(self, node): + """Updates the location information about the associated `node`.""" + + dict_ = self.__dict__ + dict_['_v__nodefile'] = node._v_file + dict_['_v__nodepath'] = node._v_pathname + # hdf5extension operations: + self._g_new(node) + + def _f_list(self, attrset='user'): + """Get a list of attribute names. + + The attrset string selects the attribute set to be used. A + 'user' value returns only user attributes (this is the default). + A 'sys' value returns only system attributes. Finally, 'all' + returns both system and user attributes. + + """ + + if attrset == "user": + return self._v_attrnamesuser[:] + elif attrset == "sys": + return self._v_attrnamessys[:] + elif attrset == "all": + return self._v_attrnames[:] + + def __dir__(self): + """Autocomplete only children named as valid python identifiers. + + Only PY3 supports this special method. + """ + return list({c for c in + super().__dir__() + self._v_attrnames + if c.isidentifier()}) + + def __getattr__(self, name): + """Get the attribute named "name".""" + + # If attribute does not exist, raise AttributeError + if name not in self._v_attrnames: + raise AttributeError(f"Attribute {name!r} does not exist " + f"in node: {self._v__nodepath!r}") + + # Read the attribute from disk. This is an optimization to read + # quickly system attributes that are _string_ values, but it + # takes care of other types as well as for example NROWS for + # Tables and EXTDIM for EArrays + format_version = self._v__format_version + value = self._g_getattr(self._v_node, name) + + # Check whether the value is pickled + # Pickled values always seems to end with a "." + maybe_pickled = ( + isinstance(value, np.generic) and # NumPy scalar? + value.dtype.type == np.bytes_ and # string type? + value.itemsize > 0 and value.endswith(b'.')) + + if (maybe_pickled and value in [b"0", b"0."]): + # Workaround for a bug in many versions of Python (starting + # somewhere after Python 2.6.1). See ticket #253. + retval = value + elif (maybe_pickled and _field_fill_re.match(name) + and format_version == (1, 5)): + # This format was used during the first 1.2 releases, just + # for string defaults. + try: + retval = pickle.loads(value) + retval = np.array(retval) + except ImportError: + retval = None # signal error avoiding exception + elif (maybe_pickled and + name == 'FILTERS' and + format_version is not None and + format_version < (2, 0)): + # This is a big hack, but we don't have other way to recognize + # pickled filters of PyTables 1.x files. 
+ value = _old_filters_re.sub(_new_filters_sub, value, 1) + retval = pickle.loads(value) # pass unpickling errors through + elif maybe_pickled: + try: + retval = pickle.loads(value) + # except cPickle.UnpicklingError: + # It seems that pickle may raise other errors than UnpicklingError + # Perhaps it would be better just an "except:" clause? + # except (cPickle.UnpicklingError, ImportError): + # Definitely (see SF bug #1254636) + except UnicodeDecodeError: + # Object maybe pickled on python 2 and unpickled on python 3. + # encoding='bytes' was added in python 3.4 to resolve this. + # However 'bytes' mangles class attributes as they are + # unplicked as bytestrings. Hence try 'latin1' first. + # Ref: http://bugs.python.org/issue6784 + try: + retval = pickle.loads(value, encoding='latin1') + except TypeError: + try: + retval = pickle.loads(value, encoding='bytes') + except Exception: + retval = value + except Exception: + retval = value + except Exception: + # catch other unpickling errors: + # ivb (2005-09-07): It is too hard to tell + # whether the unpickling failed + # because of the string not being a pickle one at all, + # because of a malformed pickle string, + # or because of some other problem in object reconstruction, + # thus making inconvenient even the issuing of a warning here. + # The documentation contains a note on this issue, + # explaining how the user can tell where the problem was. + retval = value + # Additional check for allowing a workaround for #307 + if isinstance(retval, str) and retval == '': + retval = np.array(retval)[()] + elif (name == 'FILTERS' and + format_version is not None and + format_version >= (2, 0)): + try: + retval = Filters._unpack(value) + except ValueError: + sys.stderr.write('Failed parsing FILTERS key\n') + sys.stderr.flush() + retval = None + elif name == 'TITLE' and not isinstance(value, str): + retval = value.decode('utf-8') + elif (issysattrname(name) and isinstance(value, (bytes, str)) and + not isinstance(value, str) and not _field_fill_re.match(name)): + # system attributes should always be str + # python 3, bytes and not "FIELD_[0-9]+_FILL" + retval = value.decode('utf-8') + else: + retval = value + + # Put this value in local directory + self.__dict__[name] = retval + return retval + + def _g__setattr(self, name, value): + """Set a PyTables attribute. + + Sets a (maybe new) PyTables attribute with the specified `name` + and `value`. If the attribute already exists, it is simply + replaced. + + It does not log the change. 
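+
+        For instance, storing a plain Python integer as a user attribute
+        converts it to a 0-d NumPy array before writing (see the
+        conversion code below), while the copy kept in the instance
+        dictionary is the corresponding NumPy scalar.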
+ + """ + + # Save this attribute to disk + # (overwriting an existing one if needed) + stvalue = value + if issysattrname(name): + if name in ["EXTDIM", "AUTO_INDEX", "DIRTY", "NODE_TYPE_VERSION"]: + stvalue = np.array(value, dtype=np.int32) + value = stvalue[()] + elif name == "NROWS": + stvalue = np.array(value, dtype=SizeType) + value = stvalue[()] + elif (name == "FILTERS" and + self._v__format_version is not None and + self._v__format_version >= (2, 0)): + stvalue = value._pack() + # value will remain as a Filters instance here + # Convert value from a Python scalar into a NumPy scalar + # (only in case it has not been converted yet) + # Fixes ticket #59 + if (stvalue is value and + type(value) in (bool, bytes, int, float, complex, str, + np.unicode_)): + # Additional check for allowing a workaround for #307 + if isinstance(value, str) and len(value) == 0: + stvalue = np.array('') + else: + stvalue = np.array(value) + value = stvalue[()] + + self._g_setattr(self._v_node, name, stvalue) + + # New attribute or value. Introduce it into the local + # directory + self.__dict__[name] = value + + # Finally, add this attribute to the list if not present + attrnames = self._v_attrnames + if name not in attrnames: + attrnames.append(name) + attrnames.sort() + if issysattrname(name): + attrnamessys = self._v_attrnamessys + attrnamessys.append(name) + attrnamessys.sort() + else: + attrnamesuser = self._v_attrnamesuser + attrnamesuser.append(name) + attrnamesuser.sort() + + def __setattr__(self, name, value): + """Set a PyTables attribute. + + Sets a (maybe new) PyTables attribute with the specified `name` + and `value`. If the attribute already exists, it is simply + replaced. + + A ``ValueError`` is raised when the name starts with a reserved + prefix or contains a ``/``. A `NaturalNameWarning` is issued if + the name is not a valid Python identifier. A + `PerformanceWarning` is issued when the recommended maximum + number of attributes in a node is going to be exceeded. + + """ + + nodefile = self._v__nodefile + attrnames = self._v_attrnames + + # Check for name validity + check_attribute_name(name) + + nodefile._check_writable() + + # Check if there are too many attributes. + max_node_attrs = nodefile.params['MAX_NODE_ATTRS'] + if len(attrnames) >= max_node_attrs: + warnings.warn("""\ +node ``%s`` is exceeding the recommended maximum number of attributes (%d);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v__nodepath, max_node_attrs), + PerformanceWarning) + + undo_enabled = nodefile.is_undo_enabled() + # Log old attribute removal (if any). + if undo_enabled and (name in attrnames): + self._g_del_and_log(name) + + # Set the attribute. + self._g__setattr(name, value) + + # Log new attribute addition. + if undo_enabled: + self._g_log_add(name) + + def _g_log_add(self, name): + self._v__nodefile._log('ADDATTR', self._v__nodepath, name) + + def _g_del_and_log(self, name): + nodefile = self._v__nodefile + node_pathname = self._v__nodepath + # Log *before* moving to use the right shadow name. + nodefile._log('DELATTR', node_pathname, name) + attr_to_shadow(nodefile, node_pathname, name) + + def _g__delattr(self, name): + """Delete a PyTables attribute. + + Deletes the specified existing PyTables attribute. + + It does not log the change. 
+ + """ + + # Delete the attribute from disk + self._g_remove(self._v_node, name) + + # Delete the attribute from local lists + self._v_attrnames.remove(name) + if name in self._v_attrnamessys: + self._v_attrnamessys.remove(name) + else: + self._v_attrnamesuser.remove(name) + + # Delete the attribute from the local directory + # closes (#1049285) + del self.__dict__[name] + + def __delattr__(self, name): + """Delete a PyTables attribute. + + Deletes the specified existing PyTables attribute from the + attribute set. If a nonexistent or system attribute is + specified, an ``AttributeError`` is raised. + + """ + + nodefile = self._v__nodefile + + # Check if attribute exists + if name not in self._v_attrnames: + raise AttributeError( + "Attribute ('%s') does not exist in node '%s'" + % (name, self._v__nodepath)) + + nodefile._check_writable() + + # Remove the PyTables attribute or move it to shadow. + if nodefile.is_undo_enabled(): + self._g_del_and_log(name) + else: + self._g__delattr(name) + + def __getitem__(self, name): + """The dictionary like interface for __getattr__().""" + + try: + return self.__getattr__(name) + except AttributeError: + # Capture the AttributeError an re-raise a KeyError one + raise KeyError( + "Attribute ('%s') does not exist in node '%s'" + % (name, self._v__nodepath)) + + def __setitem__(self, name, value): + """The dictionary like interface for __setattr__().""" + + self.__setattr__(name, value) + + def __delitem__(self, name): + """The dictionary like interface for __delattr__().""" + + try: + self.__delattr__(name) + except AttributeError: + # Capture the AttributeError an re-raise a KeyError one + raise KeyError( + "Attribute ('%s') does not exist in node '%s'" + % (name, self._v__nodepath)) + + def __contains__(self, name): + """Is there an attribute with that name? + + A true value is returned if the attribute set has an attribute + with the given name, false otherwise. + + """ + + return name in self._v_attrnames + + def _f_rename(self, oldattrname, newattrname): + """Rename an attribute from oldattrname to newattrname.""" + + if oldattrname == newattrname: + # Do nothing + return + + # First, fetch the value of the oldattrname + attrvalue = getattr(self, oldattrname) + + # Now, create the new attribute + setattr(self, newattrname, attrvalue) + + # Finally, remove the old attribute + delattr(self, oldattrname) + + def _g_copy(self, newset, set_attr=None, copyclass=False): + """Copy set attributes. + + Copies all user and allowed system PyTables attributes to the + given attribute set, replacing the existing ones. + + You can specify a *bound* method of the destination set that + will be used to set its attributes. Else, its `_g__setattr` + method will be used. + + Changes are logged depending on the chosen setting method. The + default setting method does not log anything. + + .. versionchanged:: 3.0 + The *newSet* parameter has been renamed into *newset*. + + .. versionchanged:: 3.0 + The *copyClass* parameter has been renamed into *copyclass*. + + """ + + copysysattrs = newset._v__nodefile.params['PYTABLES_SYS_ATTRS'] + if set_attr is None: + set_attr = newset._g__setattr + + for attrname in self._v_attrnamesuser: + # Do not copy the unimplemented attributes. + if attrname not in self._v_unimplemented: + set_attr(attrname, getattr(self, attrname)) + # Copy the system attributes that we are allowed to. 
+ if copysysattrs: + for attrname in self._v_attrnamessys: + if ((attrname not in SYS_ATTRS_NOTTOBECOPIED) and + # Do not copy the FIELD_ attributes in tables as this can + # be really *slow* (don't know exactly the reason). + # See #304. + not attrname.startswith("FIELD_")): + set_attr(attrname, getattr(self, attrname)) + # Copy CLASS and VERSION attributes if requested + if copyclass: + for attrname in FORCE_COPY_CLASS: + if attrname in self._v_attrnamessys: + set_attr(attrname, getattr(self, attrname)) + + def _f_copy(self, where): + """Copy attributes to the where node. + + Copies all user and certain system attributes to the given where + node (a Node instance - see :ref:`NodeClassDescr`), replacing + the existing ones. + + """ + + # AttributeSet must be defined in order to define a Node. + # However, we need to know Node here. + # Using class_name_dict avoids a circular import. + if not isinstance(where, class_name_dict['Node']): + raise TypeError(f"destination object is not a node: {where!r}") + self._g_copy(where._v_attrs, where._v_attrs.__setattr__) + + def _g_close(self): + # Nothing will be done here, as the existing instance is completely + # operative now. + pass + + def __str__(self): + """The string representation for this object.""" + + # The pathname + pathname = self._v__nodepath + # Get this class name + classname = self.__class__.__name__ + # The attribute names + attrnumber = sum(1 for _ in self._v_attrnames) + return f"{pathname}._v_attrs ({classname}), {attrnumber} attributes" + + def __repr__(self): + """A detailed string representation for this object.""" + + # print additional info only if there are attributes to show + attrnames = list(self._v_attrnames) + if attrnames: + rep = [f'{attr} := {getattr(self, attr)!r}' for attr in attrnames] + return f"{self!s}:\n [" + ',\n '.join(rep) + "]" + else: + return str(self) + + +class NotLoggedAttributeSet(AttributeSet): + def _g_log_add(self, name): + pass + + def _g_del_and_log(self, name): + self._g__delattr(name) diff --git a/tables/carray.py b/tables/carray.py new file mode 100644 index 0000000..3d6d9db --- /dev/null +++ b/tables/carray.py @@ -0,0 +1,278 @@ +"""Here is defined the CArray class.""" + +import sys + +import numpy as np + +from .atom import Atom +from .array import Array +from .utils import correct_byteorder, SizeType + + +# default version for CARRAY objects +# obversion = "1.0" # Support for time & enumerated datatypes. +obversion = "1.1" # Numeric and numarray flavors are gone. + + +class CArray(Array): + """This class represents homogeneous datasets in an HDF5 file. + + The difference between a CArray and a normal Array (see + :ref:`ArrayClassDescr`), from which it inherits, is that a CArray + has a chunked layout and, as a consequence, it supports compression. + You can use datasets of this class to easily save or load arrays to + or from disk, with compression support included. + + CArray includes all the instance variables and methods of Array. + Only those with different behavior are mentioned here. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode*. + + name : str + The name of this node in its parent group. + atom + An `Atom` instance representing the *type* and *shape* of + the atomic objects to be saved. + + shape + The shape of the new array. + + title + A description for this node (it sets the ``TITLE`` HDF5 + attribute on disk). 
+
+    filters
+        An instance of the `Filters` class that provides
+        information about the desired I/O filters to be applied
+        during the life of this object.
+
+    chunkshape
+        The shape of the data chunk to be read or written in a
+        single HDF5 I/O operation. Filters are applied to those
+        chunks of data. The dimensionality of `chunkshape` must
+        be the same as that of `shape`. If ``None``, a sensible
+        value is calculated (which is recommended).
+
+    byteorder
+        The byteorder of the data *on disk*, specified as 'little'
+        or 'big'. If this is not specified, the byteorder is that
+        of the platform.
+
+    track_times
+        Whether time data associated with the leaf are recorded (object
+        access time, raw data modification time, metadata change time, object
+        birth time); default True. Semantics of these times depend on their
+        implementation in the HDF5 library: refer to documentation of the
+        H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata
+        change time) is implemented.
+
+        .. versionadded:: 3.4.3
+
+    Examples
+    --------
+
+    See below a small example of the use of the `CArray` class.
+    The code is available in ``examples/carray1.py``::
+
+        import numpy as np
+        import tables as tb
+
+        fileName = 'carray1.h5'
+        shape = (200, 300)
+        atom = tb.UInt8Atom()
+        filters = tb.Filters(complevel=5, complib='zlib')
+
+        h5f = tb.open_file(fileName, 'w')
+        ca = h5f.create_carray(h5f.root, 'carray', atom, shape,
+                               filters=filters)
+
+        # Fill a hyperslab in ``ca``.
+        ca[10:60, 20:70] = np.ones((50, 50))
+        h5f.close()
+
+        # Re-open and read another hyperslab
+        h5f = tb.open_file(fileName)
+        print(h5f)
+        print(h5f.root.carray[8:12, 18:22])
+        h5f.close()
+
+    The output for the previous script is something like::
+
+        carray1.h5 (File) ''
+        Last modif.: 'Thu Apr 12 10:15:38 2007'
+        Object Tree:
+        / (RootGroup) ''
+        /carray (CArray(200, 300), shuffle, zlib(5)) ''
+
+        [[0 0 0 0]
+         [0 0 0 0]
+         [0 0 1 1]
+         [0 0 1 1]]
+
+    """
+
+    # Class identifier.
+    _c_classid = 'CARRAY'
+
+    def __init__(self, parentnode, name,
+                 atom=None, shape=None,
+                 title="", filters=None,
+                 chunkshape=None, byteorder=None,
+                 _log=True, track_times=True):
+
+        self.atom = atom
+        """An `Atom` instance representing the type and shape of the atomic
+        objects to be saved.
+        """
+        self.shape = None
+        """The shape of the stored array."""
+        self.extdim = -1  # `CArray` objects are not enlargeable by default
+        """The index of the enlargeable dimension."""
+
+        # Other private attributes
+        self._v_version = None
+        """The object version of this array."""
+        self._v_new = new = atom is not None
+        """Is this the first time the node has been created?"""
+        self._v_new_title = title
+        """New title for this node."""
+        self._v_convert = True
+        """Whether the ``Array`` object must be converted or not."""
+        self._v_chunkshape = chunkshape
+        """Private storage for the `chunkshape` property of the leaf."""
+
+        # Miscellaneous iteration rubbish.
+        self._start = None
+        """Starting row for the current iteration."""
+        self._stop = None
+        """Stopping row for the current iteration."""
+        self._step = None
+        """Step size for the current iteration."""
+        self._nrowsread = None
+        """Number of rows read up to the current state of iteration."""
+        self._startb = None
+        """Starting row for current buffer."""
+        self._stopb = None
+        """Stopping row for current buffer.
""" + self._row = None + """Current row in iterators (sentinel).""" + self._init = False + """Whether we are in the middle of an iteration or not (sentinel).""" + self.listarr = None + """Current buffer in iterators.""" + + if new: + if not isinstance(atom, Atom): + raise ValueError("atom parameter should be an instance of " + "tables.Atom and you passed a %s." % + type(atom)) + if shape is None: + raise ValueError("you must specify a non-empty shape") + try: + shape = tuple(shape) + except TypeError: + raise TypeError("`shape` parameter must be a sequence " + "and you passed a %s" % type(shape)) + self.shape = tuple(SizeType(s) for s in shape) + + if chunkshape is not None: + try: + chunkshape = tuple(chunkshape) + except TypeError: + raise TypeError( + "`chunkshape` parameter must be a sequence " + "and you passed a %s" % type(chunkshape)) + if len(shape) != len(chunkshape): + raise ValueError(f"the shape ({shape}) and chunkshape " + f"({chunkshape}) ranks must be equal.") + elif min(chunkshape) < 1: + raise ValueError("chunkshape parameter cannot have " + "zero-dimensions.") + self._v_chunkshape = tuple(SizeType(s) for s in chunkshape) + + # The `Array` class is not abstract enough! :( + super(Array, self).__init__(parentnode, name, new, filters, + byteorder, _log, track_times) + + def _g_create(self): + """Create a new array in file (specific part).""" + + if min(self.shape) < 1: + raise ValueError( + "shape parameter cannot have zero-dimensions.") + # Finish the common part of creation process + return self._g_create_common(self.nrows) + + def _g_create_common(self, expectedrows): + """Create a new array in file (common part).""" + + self._v_version = obversion + + if self._v_chunkshape is None: + # Compute the optimal chunk size + self._v_chunkshape = self._calc_chunkshape( + expectedrows, self.rowsize, self.atom.size) + # Compute the optimal nrowsinbuf + self.nrowsinbuf = self._calc_nrowsinbuf() + # Correct the byteorder if needed + if self.byteorder is None: + self.byteorder = correct_byteorder(self.atom.type, sys.byteorder) + + try: + # ``self._v_objectid`` needs to be set because would be + # needed for setting attributes in some descendants later + # on + self._v_objectid = self._create_carray(self._v_new_title) + except Exception: # XXX + # Problems creating the Array on disk. Close node and re-raise. 
+ self.close(flush=0) + raise + + return self._v_objectid + + def _g_copy_with_stats(self, group, name, start, stop, step, + title, filters, chunkshape, _log, **kwargs): + """Private part of Leaf.copy() for each kind of leaf.""" + + (start, stop, step) = self._process_range_read(start, stop, step) + maindim = self.maindim + shape = list(self.shape) + shape[maindim] = len(range(start, stop, step)) + # Now, fill the new carray with values from source + nrowsinbuf = self.nrowsinbuf + # The slices parameter for self.__getitem__ + slices = [slice(0, dim, 1) for dim in self.shape] + # This is a hack to prevent doing unnecessary conversions + # when copying buffers + self._v_convert = False + # Build the new CArray object + object = CArray(group, name, atom=self.atom, shape=shape, + title=title, filters=filters, chunkshape=chunkshape, + _log=_log) + # Start the copy itself + for start2 in range(start, stop, step * nrowsinbuf): + # Save the records on disk + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Set the proper slice in the main dimension + slices[maindim] = slice(start2, stop2, step) + start3 = (start2 - start) // step + stop3 = start3 + nrowsinbuf + if stop3 > shape[maindim]: + stop3 = shape[maindim] + # The next line should be generalised if, in the future, + # maindim is designed to be different from 0 in CArrays. + # See ticket #199. + object[start3:stop3] = self.__getitem__(tuple(slices)) + # Activate the conversion again (default) + self._v_convert = True + nbytes = np.prod(self.shape, dtype=SizeType) * self.atom.size + + return (object, nbytes) diff --git a/tables/conditions.py b/tables/conditions.py new file mode 100644 index 0000000..c5fff08 --- /dev/null +++ b/tables/conditions.py @@ -0,0 +1,448 @@ +"""Utility functions and classes for supporting query conditions. + +Classes: + +`CompileCondition` + Container for a compiled condition. + +Functions: + +`compile_condition` + Compile a condition and extract usable index conditions. +`call_on_recarr` + Evaluate a function over a structured array. + +""" + +import re +import numexpr as ne + +from .utilsextension import get_nested_field +from .utils import lazyattr + + +_no_matching_opcode = re.compile(r"[^a-z]([a-z]+)_([a-z]+)[^a-z]") +# E.g. "gt" and "bfc" from "couldn't find matching opcode for 'gt_bfc'". + + +def _unsupported_operation_error(exception): + """Make the \"no matching opcode\" Numexpr `exception` more clear. + + A new exception of the same kind is returned. + + """ + + message = exception.args[0] + op, types = _no_matching_opcode.search(message).groups() + newmessage = "unsupported operand types for *%s*: " % op + newmessage += ', '.join( + ne.necompiler.typecode_to_kind[t] for t in types[1:]) + return exception.__class__(newmessage) + + +def _check_indexable_cmp(getidxcmp): + """Decorate `getidxcmp` to check the returned indexable comparison. + + This does some extra checking that Numexpr would perform later on + the comparison if it was compiled within a complete condition. + + """ + + def newfunc(exprnode, indexedcols): + result = getidxcmp(exprnode, indexedcols) + if result[0] is not None: + try: + ne.necompiler.typeCompileAst( + ne.necompiler.expressionToAST(exprnode)) + except NotImplementedError as nie: + # Try to make this Numexpr error less cryptic. 
+ raise _unsupported_operation_error(nie) + return result + newfunc.__name__ = getidxcmp.__name__ + newfunc.__doc__ = getidxcmp.__doc__ + return newfunc + + +@_check_indexable_cmp +def _get_indexable_cmp(exprnode, indexedcols): + """Get the indexable variable-constant comparison in `exprnode`. + + A tuple of (variable, operation, constant) is returned if + `exprnode` is a variable-constant (or constant-variable) + comparison, and the variable is in `indexedcols`. A normal + variable can also be used instead of a constant: a tuple with its + name will appear instead of its value. + + Otherwise, the values in the tuple are ``None``. + """ + + not_indexable = (None, None, None) + turncmp = {'lt': 'gt', + 'le': 'ge', + 'eq': 'eq', + 'ge': 'le', + 'gt': 'lt', } + + def get_cmp(var, const, op): + var_value, const_value = var.value, const.value + if (var.astType == 'variable' and var_value in indexedcols + and const.astType in ['constant', 'variable']): + if const.astType == 'variable': + const_value = (const_value, ) + return (var_value, op, const_value) + return None + + def is_indexed_boolean(node): + return (node.astType == 'variable' + and node.astKind == 'bool' + and node.value in indexedcols) + + # Boolean variables are indexable by themselves. + if is_indexed_boolean(exprnode): + return (exprnode.value, 'eq', True) + # And so are negations of boolean variables. + if exprnode.astType == 'op' and exprnode.value == 'invert': + child = exprnode.children[0] + if is_indexed_boolean(child): + return (child.value, 'eq', False) + # A negation of an expression will be returned as ``~child``. + # The indexability of the negated expression will be decided later on. + if child.astKind == "bool": + return (child, 'invert', None) + + # Check node type. Only comparisons are indexable from now on. + if exprnode.astType != 'op': + return not_indexable + cmpop = exprnode.value + if cmpop not in turncmp: + return not_indexable + + # Look for a variable-constant comparison in both directions. + left, right = exprnode.children + cmp_ = get_cmp(left, right, cmpop) + if cmp_: + return cmp_ + cmp_ = get_cmp(right, left, turncmp[cmpop]) + if cmp_: + return cmp_ + + return not_indexable + + +def _equiv_expr_node(x, y): + """Returns whether two ExpressionNodes are equivalent. + + This is needed because '==' is overridden on ExpressionNode to + return a new ExpressionNode. + + """ + if (not isinstance(x, ne.expressions.ExpressionNode) + and not isinstance(y, ne.expressions.ExpressionNode)): + return x == y + elif (type(x) is not type(y) + or not isinstance(x, ne.expressions.ExpressionNode) + or not isinstance(y, ne.expressions.ExpressionNode) + or x.value != y.value + or x.astKind != y.astKind + or len(x.children) != len(y.children)): + return False + for xchild, ychild in zip(x.children, y.children): + if not _equiv_expr_node(xchild, ychild): + return False + return True + + +def _get_idx_expr_recurse(exprnode, indexedcols, idxexprs, strexpr): + """Here lives the actual implementation of the get_idx_expr() wrapper. + + 'idxexprs' is a list of expressions in the form ``(var, (ops), + (limits))``. 'strexpr' is the indexable expression in string format. + These parameters will be received empty (i.e. [], ['']) for the + first time and populated during the different recursive calls. + Finally, they are returned in the last level to the original + wrapper. If 'exprnode' is not indexable, it will return the tuple + ([], ['']) so as to signal this. 
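+
+    For instance, an indexable comparison such as ``a > 0`` (with ``a`` an
+    indexed column) contributes a single entry::
+
+        ('a', ('gt',), (0,))
+
+    while a conjunction like ``(a > 0) & (a < 10)`` over the same indexed
+    column is collapsed into a single range entry::
+
+        ('a', ('gt', 'lt'), (0, 10))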
+ + """ + + not_indexable = ([], ['']) + op_conv = { + 'and': '&', + 'or': '|', + 'not': '~', + } + negcmp = { + 'lt': 'ge', + 'le': 'gt', + 'ge': 'lt', + 'gt': 'le', + } + + def fix_invert(idxcmp, exprnode, indexedcols): + invert = False + # Loop until all leading negations have been dealt with + while idxcmp[1] == "invert": + invert ^= True + # The information about the negated node is in first position + exprnode = idxcmp[0] + idxcmp = _get_indexable_cmp(exprnode, indexedcols) + return idxcmp, exprnode, invert + + # Indexable variable-constant comparison. + idxcmp = _get_indexable_cmp(exprnode, indexedcols) + idxcmp, exprnode, invert = fix_invert(idxcmp, exprnode, indexedcols) + if idxcmp[0]: + if invert: + var, op, value = idxcmp + if op == 'eq' and value in [True, False]: + # ``var`` must be a boolean index. Flip its value. + value ^= True + else: + op = negcmp[op] + expr = (var, (op,), (value,)) + invert = False + else: + expr = (idxcmp[0], (idxcmp[1],), (idxcmp[2],)) + return [expr] + + # For now negations of complex expressions will be not supported as + # forming part of an indexable condition. This might be supported in + # the future. + if invert: + return not_indexable + + # Only conjunctions and disjunctions of comparisons are considered + # for the moment. + if exprnode.astType != 'op' or exprnode.value not in ['and', 'or']: + return not_indexable + + left, right = exprnode.children + # Get the expression at left + lcolvar, lop, llim = _get_indexable_cmp(left, indexedcols) + # Get the expression at right + rcolvar, rop, rlim = _get_indexable_cmp(right, indexedcols) + + # Use conjunction of indexable VC comparisons like + # ``(a <[=] x) & (x <[=] b)`` or ``(a >[=] x) & (x >[=] b)`` + # as ``a <[=] x <[=] b``, for the moment. + op = exprnode.value + if (lcolvar is not None and rcolvar is not None + and _equiv_expr_node(lcolvar, rcolvar) and op == 'and'): + if lop in ['gt', 'ge'] and rop in ['lt', 'le']: # l <= x <= r + expr = (lcolvar, (lop, rop), (llim, rlim)) + return [expr] + if lop in ['lt', 'le'] and rop in ['gt', 'ge']: # l >= x >= r + expr = (rcolvar, (rop, lop), (rlim, llim)) + return [expr] + + # Recursively get the expressions at the left and the right + lexpr = _get_idx_expr_recurse(left, indexedcols, idxexprs, strexpr) + rexpr = _get_idx_expr_recurse(right, indexedcols, idxexprs, strexpr) + + def add_expr(expr, idxexprs, strexpr): + """Add a single expression to the list.""" + + if isinstance(expr, list): + # expr is a single expression + idxexprs.append(expr[0]) + lenexprs = len(idxexprs) + # Mutate the strexpr string + if lenexprs == 1: + strexpr[:] = ["e0"] + else: + strexpr[:] = [ + "(%s %s e%d)" % (strexpr[0], op_conv[op], lenexprs - 1)] + + # Add expressions to the indexable list when they are and'ed, or + # they are both indexable. + if lexpr != not_indexable and (op == "and" or rexpr != not_indexable): + add_expr(lexpr, idxexprs, strexpr) + if rexpr != not_indexable: + add_expr(rexpr, idxexprs, strexpr) + return (idxexprs, strexpr) + if rexpr != not_indexable and op == "and": + add_expr(rexpr, idxexprs, strexpr) + return (idxexprs, strexpr) + + # Can not use indexed column. + return not_indexable + + +def _get_idx_expr(expr, indexedcols): + """Extract an indexable expression out of `exprnode`. + + Looks for variable-constant comparisons in the expression node + `exprnode` involving variables in `indexedcols`. 
+ + It returns a tuple of (idxexprs, strexpr) where 'idxexprs' is a + list of expressions in the form ``(var, (ops), (limits))`` and + 'strexpr' is the indexable expression in string format. + + Expressions such as ``0 < c1 <= 1`` do not work as expected. + + Right now only some of the *indexable comparisons* are considered: + + * ``a <[=] x``, ``a == x`` and ``a >[=] x`` + * ``(a <[=] x) & (y <[=] b)`` and ``(a == x) | (b == y)`` + * ``~(~c_bool)``, ``~~c_bool`` and ``~(~c_bool) & (c_extra != 2)`` + + (where ``a``, ``b`` and ``c_bool`` are indexed columns, but + ``c_extra`` is not) + + Particularly, the ``!=`` operator and negations of complex boolean + expressions are *not considered* as valid candidates: + + * ``a != 1`` and ``c_bool != False`` + * ``~((a > 0) & (c_bool))`` + + """ + + return _get_idx_expr_recurse(expr, indexedcols, [], ['']) + + +class CompiledCondition: + """Container for a compiled condition.""" + + @lazyattr + def index_variables(self): + """The columns participating in the index expression.""" + + idxexprs = self.index_expressions + idxvars = [] + for expr in idxexprs: + idxvar = expr[0] + if idxvar not in idxvars: + idxvars.append(idxvar) + return frozenset(idxvars) + + def __init__(self, func, params, idxexprs, strexpr, **kwargs): + self.function = func + """The compiled function object corresponding to this condition.""" + self.parameters = params + """A list of parameter names for this condition.""" + self.index_expressions = idxexprs + """A list of expressions in the form ``(var, (ops), (limits))``.""" + self.string_expression = strexpr + """The indexable expression in string format.""" + self.kwargs = kwargs + """NumExpr kwargs (used to pass ex_uses_vml to numexpr)""" + + def __repr__(self): + return ("idxexprs: %s\nstrexpr: %s\nidxvars: %s" + % (self.index_expressions, self.string_expression, + self.index_variables)) + + def with_replaced_vars(self, condvars): + """Replace index limit variables with their values in-place. + + A new compiled condition is returned. Values are taken from + the `condvars` mapping and converted to Python scalars. + """ + + exprs = self.index_expressions + exprs2 = [] + for expr in exprs: + idxlims = expr[2] # the limits are in third place + limit_values = [] + for idxlim in idxlims: + if isinstance(idxlim, tuple): # variable + idxlim = condvars[idxlim[0]] # look up value + idxlim = idxlim.tolist() # convert back to Python + limit_values.append(idxlim) + # Add this replaced entry to the new exprs2 + var, ops, _ = expr + exprs2.append((var, ops, tuple(limit_values))) + # Create a new container for the converted values + newcc = CompiledCondition( + self.function, self.parameters, exprs2, self.string_expression, + **self.kwargs) + return newcc + + +def _get_variable_names(expression): + """Return the list of variable names in the Numexpr `expression`.""" + + names = [] + stack = [expression] + while stack: + node = stack.pop() + if node.astType == 'variable': + names.append(node.value) + elif hasattr(node, 'children'): + stack.extend(node.children) + return list(set(names)) # remove repeated names + + +def compile_condition(condition, typemap, indexedcols): + """Compile a condition and extract usable index conditions. + + Looks for variable-constant comparisons in the `condition` string + involving the indexed columns whose variable names appear in + `indexedcols`. The part of `condition` having usable indexes is + returned as a compiled condition in a `CompiledCondition` container. 
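+
+    For instance, given an `indexedcols` set containing only ``c1``, a
+    condition string like ``'(c1 > 0) & (c2 < 1.5)'`` results in a
+    `CompiledCondition` whose ``index_expressions`` attribute is
+    ``[('c1', ('gt',), (0,))]`` and whose ``string_expression`` is ``'e0'``
+    (the non-indexable ``c2`` comparison simply does not contribute an
+    index expression).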
+ + Expressions such as '0 < c1 <= 1' do not work as expected. The + Numexpr types of *all* variables must be given in the `typemap` + mapping. The ``function`` of the resulting `CompiledCondition` + instance is a Numexpr function object, and the ``parameters`` list + indicates the order of its parameters. + + """ + + # Get the expression tree and extract index conditions. + expr = ne.necompiler.stringToExpression(condition, typemap, {}) + if expr.astKind != 'bool': + raise TypeError("condition ``%s`` does not have a boolean type" + % condition) + idxexprs = _get_idx_expr(expr, indexedcols) + # Post-process the answer + if isinstance(idxexprs, list): + # Simple expression + strexpr = ['e0'] + else: + # Complex expression + idxexprs, strexpr = idxexprs + # Get rid of the unneccessary list wrapper for strexpr + strexpr = strexpr[0] + + # Get the variable names used in the condition. + # At the same time, build its signature. + varnames = _get_variable_names(expr) + signature = [(var, typemap[var]) for var in varnames] + try: + # See the comments in `numexpr.evaluate()` for the + # reasons of inserting copy operators for unaligned, + # *unidimensional* arrays. + func = ne.necompiler.NumExpr(expr, signature) + except NotImplementedError as nie: + # Try to make this Numexpr error less cryptic. + raise _unsupported_operation_error(nie) + + _, ex_uses_vml = ne.necompiler.getExprNames(condition, {}) + kwargs = {'ex_uses_vml': ex_uses_vml} + + params = varnames + # This is more comfortable to handle about than a tuple. + return CompiledCondition(func, params, idxexprs, strexpr, **kwargs) + + +def call_on_recarr(func, params, recarr, param2arg=None, **kwargs): + """Call `func` with `params` over `recarr`. + + The `param2arg` function, when specified, is used to get an argument + given a parameter name; otherwise, the parameter itself is used as + an argument. When the argument is a `Column` object, the proper + column from `recarr` is used as its value. + + """ + + args = [] + for param in params: + if param2arg: + arg = param2arg(param) + else: + arg = param + if hasattr(arg, 'pathname'): # looks like a column + arg = get_nested_field(recarr, arg.pathname) + args.append(arg) + return func(*args, **kwargs) diff --git a/tables/definitions.pxd b/tables/definitions.pxd new file mode 100644 index 0000000..bc51c3d --- /dev/null +++ b/tables/definitions.pxd @@ -0,0 +1,569 @@ +######################################################################## +# +# License: BSD +# Created: June 20, 2005 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id: definitions.pyd 1018 2005-06-20 09:43:34Z faltet $ +# +######################################################################## + +"""Here are some definitions for sharing between extensions.""" + +import sys + +cdef extern from *: + ctypedef long uintptr_t + +# Standard C functions. 
+cdef extern from "time.h": + ctypedef int time_t + +from libc.stdio cimport FILE + + +#----------------------------------------------------------------------------- + +# API for NumPy objects +from numpy cimport dtype +cdef extern from "numpy/arrayobject.h": + object PyArray_Scalar(void *data, dtype descr, object itemsize) + + +#----------------------------------------------------------------------------- + + +# Structs and types from HDF5 +cdef extern from "hdf5.h" nogil: + + ctypedef long long hid_t # In H5Ipublic.h + ctypedef int hbool_t + ctypedef int herr_t + ctypedef int htri_t + # hsize_t should be unsigned, but Windows platform does not support + # such an unsigned long long type. + ctypedef unsigned long long hsize_t + ctypedef signed long long hssize_t + ctypedef long long int64_t + ctypedef unsigned long long haddr_t + ctypedef haddr_t hobj_ref_t + + ctypedef struct hvl_t: + size_t len # Length of VL data (in base type units) + void *p # Pointer to VL data + + int H5F_ACC_TRUNC, H5F_ACC_RDONLY, H5F_ACC_RDWR, H5F_ACC_EXCL + int H5F_ACC_DEBUG, H5F_ACC_CREAT + int H5P_DEFAULT, H5P_DATASET_XFER, H5S_ALL + int H5P_FILE_CREATE, H5P_FILE_ACCESS + int H5FD_LOG_LOC_WRITE, H5FD_LOG_ALL + int H5I_INVALID_HID + int H5E_DEFAULT + int H5T_STD_REF_OBJ + int H5R_OBJ_REF_BUF_SIZE + + # Library types + cdef enum H5I_type_t: + H5I_UNINIT = -2 # uninitialized type + H5I_BADID = -1 # invalid Type + H5I_FILE = 1 # File objects + H5I_GROUP = 0 # Group objects + H5I_DATATYPE = 1 # Datatype objects + H5I_DATASPACE = 2 # Dataspace objects + H5I_DATASET = 3 # Dataset objects + H5I_ATTR = 4 # Attribute objects + H5I_REFERENCE = 5 # Reference objects + H5I_VFL = 6 # virtual file layer + H5I_GENPROP_CLS = 7 # generic property list classes + H5I_GENPROP_LST = 8 # generic property lists + H5I_ERROR_CLASS = 9 # error classes + H5I_ERROR_MSG = 10 # error messages + H5I_ERROR_STACK = 11 # error stacks + H5I_NTYPES # Sentinel value - must be last + + # Reference types + cdef enum H5R_type_t: + H5R_BADTYPE = -1 # Invalid Reference Type + H5R_OBJECT = 0 # Object reference + H5R_DATASET_REGION = 1 # Dataset Region Reference + H5R_MAXTYPE # Sentinel value - must be last + + # The difference between a single file and a set of mounted files + cdef enum H5F_scope_t: + H5F_SCOPE_LOCAL = 0 # specified file handle only + H5F_SCOPE_GLOBAL = 1 # entire virtual file + H5F_SCOPE_DOWN = 2 # for internal use only + + cdef enum H5FD_mem_t: + H5FD_MEM_NOLIST = -1, # Data should not appear in the free list. + # Must be negative. + H5FD_MEM_DEFAULT = 0, # Value not yet set. Can also be the + # datatype set in a larger allocation + # that will be suballocated by the library. + # Must be zero. + H5FD_MEM_SUPER = 1, # Superblock data + H5FD_MEM_BTREE = 2, # B-tree data + H5FD_MEM_DRAW = 3, # Raw data (content of datasets, etc.) 
+ H5FD_MEM_GHEAP = 4, # Global heap data + H5FD_MEM_LHEAP = 5, # Local heap data + H5FD_MEM_OHDR = 6, # Object header data + H5FD_MEM_NTYPES # Sentinel value - must be last + + cdef enum H5O_type_t: + H5O_TYPE_UNKNOWN = -1 # Unknown object type + H5O_TYPE_GROUP # Object is a group + H5O_TYPE_DATASET # Object is a dataset + H5O_TYPE_NAMED_DATATYPE # Object is a named data type + + cdef enum H5L_type_t: + H5L_TYPE_ERROR = -1 # Invalid link type id + H5L_TYPE_HARD = 0 # Hard link id + H5L_TYPE_SOFT = 1 # Soft link id + H5L_TYPE_EXTERNAL = 64, # External link id + + # Values for fill value status + cdef enum H5D_fill_value_t: + H5D_FILL_VALUE_ERROR = -1 + H5D_FILL_VALUE_UNDEFINED = 0 + H5D_FILL_VALUE_DEFAULT = 1 + H5D_FILL_VALUE_USER_DEFINED = 2 + + # HDF5 layouts + cdef enum H5D_layout_t: + H5D_LAYOUT_ERROR = -1 + H5D_COMPACT = 0 # raw data is very small + H5D_CONTIGUOUS = 1 # the default + H5D_CHUNKED = 2 # slow and fancy + H5D_NLAYOUTS = 3 # this one must be last! + + # Byte orders + cdef enum H5T_order_t: + H5T_ORDER_ERROR = -1 # error + H5T_ORDER_LE = 0 # little endian + H5T_ORDER_BE = 1 # bit endian + H5T_ORDER_VAX = 2 # VAX mixed endian + H5T_ORDER_NONE = 3 # no particular order (strings, bits,..) + + # HDF5 signed enums + cdef enum H5T_sign_t: + H5T_SGN_ERROR = -1 # error + H5T_SGN_NONE = 0 # this is an unsigned type + H5T_SGN_2 = 1 # two's complement + H5T_NSGN = 2 # this must be last! + + # HDF5 type classes + cdef enum H5T_class_t: + H5T_NO_CLASS = -1 # error + H5T_INTEGER = 0 # integer types + H5T_FLOAT = 1 # floating-point types + H5T_TIME = 2 # date and time types + H5T_STRING = 3 # character string types + H5T_BITFIELD = 4 # bit field types + H5T_OPAQUE = 5 # opaque types + H5T_COMPOUND = 6 # compound types + H5T_REFERENCE = 7 # reference types + H5T_ENUM = 8 # enumeration types + H5T_VLEN = 9 # variable-length types + H5T_ARRAY = 10 # array types + H5T_NCLASSES # this must be last + + # Native types + hid_t H5T_C_S1 + hid_t H5T_NATIVE_B8 + hid_t H5T_NATIVE_CHAR + hid_t H5T_NATIVE_SCHAR + hid_t H5T_NATIVE_UCHAR + hid_t H5T_NATIVE_SHORT + hid_t H5T_NATIVE_USHORT + hid_t H5T_NATIVE_INT + hid_t H5T_NATIVE_UINT + hid_t H5T_NATIVE_LONG + hid_t H5T_NATIVE_ULONG + hid_t H5T_NATIVE_LLONG + hid_t H5T_NATIVE_ULLONG + hid_t H5T_NATIVE_FLOAT + hid_t H5T_NATIVE_DOUBLE + hid_t H5T_NATIVE_LDOUBLE + + # "Standard" types + hid_t H5T_STD_I8LE + hid_t H5T_STD_I16LE + hid_t H5T_STD_I32LE + hid_t H5T_STD_I64LE + hid_t H5T_STD_U8LE + hid_t H5T_STD_U16LE + hid_t H5T_STD_U32LE + hid_t H5T_STD_U64LE + hid_t H5T_STD_B8LE + hid_t H5T_STD_B16LE + hid_t H5T_STD_B32LE + hid_t H5T_STD_B64LE + hid_t H5T_IEEE_F32LE + hid_t H5T_IEEE_F64LE + hid_t H5T_STD_I8BE + hid_t H5T_STD_I16BE + hid_t H5T_STD_I32BE + hid_t H5T_STD_I64BE + hid_t H5T_STD_U8BE + hid_t H5T_STD_U16BE + hid_t H5T_STD_U32BE + hid_t H5T_STD_U64BE + hid_t H5T_STD_B8BE + hid_t H5T_STD_B16BE + hid_t H5T_STD_B32BE + hid_t H5T_STD_B64BE + hid_t H5T_IEEE_F32BE + hid_t H5T_IEEE_F64BE + + # Types which are particular to UNIX (for Time types) + hid_t H5T_UNIX_D32LE + hid_t H5T_UNIX_D64LE + hid_t H5T_UNIX_D32BE + hid_t H5T_UNIX_D64BE + + # The order to retrieve atomic native datatype + cdef enum H5T_direction_t: + H5T_DIR_DEFAULT = 0 # default direction is inscendent + H5T_DIR_ASCEND = 1 # in inscendent order + H5T_DIR_DESCEND = 2 # in descendent order + + # Codes for defining selections + cdef enum H5S_seloper_t: + H5S_SELECT_NOOP = -1 + H5S_SELECT_SET = 0 + H5S_SELECT_OR + H5S_SELECT_AND + H5S_SELECT_XOR + H5S_SELECT_NOTB + H5S_SELECT_NOTA + 
H5S_SELECT_APPEND + H5S_SELECT_PREPEND + H5S_SELECT_INVALID # Must be the last one + + # Character set to use for text strings + cdef enum H5T_cset_t: + H5T_CSET_ERROR = -1 # error + H5T_CSET_ASCII = 0 # US ASCII + H5T_CSET_UTF8 = 1 # UTF-8 Unicode encoding + H5T_CSET_RESERVED_2 = 2 + H5T_CSET_RESERVED_3 = 3 + H5T_CSET_RESERVED_4 = 4 + H5T_CSET_RESERVED_5 = 5 + H5T_CSET_RESERVED_6 = 6 + H5T_CSET_RESERVED_7 = 7 + H5T_CSET_RESERVED_8 = 8 + H5T_CSET_RESERVED_9 = 9 + H5T_CSET_RESERVED_10 = 10 + H5T_CSET_RESERVED_11 = 11 + H5T_CSET_RESERVED_12 = 12 + H5T_CSET_RESERVED_13 = 13 + H5T_CSET_RESERVED_14 = 14 + H5T_CSET_RESERVED_15 = 15 + + # Error stack traversal direction + cdef enum H5E_direction_t: + H5E_WALK_UPWARD = 0 # begin deep, end at API function + H5E_WALK_DOWNWARD = 1 # begin at API function, end deep + + cdef enum H5E_type_t: + H5E_MAJOR + H5E_MINOR + + ctypedef struct H5E_error_t: + hid_t cls_id # class ID + hid_t maj_num # major error ID + hid_t min_num # minor error number + unsigned line # line in file where error occurs + const char *func_name # function in which error occurred + const char *file_name # file in which error occurred + const char *desc # optional supplied description + + ctypedef herr_t (*H5E_walk_t)(unsigned n, H5E_error_t *err, void *data) + ctypedef herr_t (*H5E_auto_t)(hid_t estack, void *data) + + # object info + ctypedef struct H5O_info_t: + unsigned long fileno # Number of file where object is located + haddr_t addr # Object address in file + H5O_type_t type # Basic object type + unsigned rc # Reference count of object + time_t atime # Access time + time_t mtime # Modification time + time_t ctime # Change time + time_t btime # Birth time + hsize_t num_attrs # number of attributes attached to object + #H5O_hdr_info_t hdr # Object header information + #struct { + # H5_ih_info_t obj + # H5_ih_info_t attr + #} meta_size + + + #------------------------------------------------------------------ + + # HDF5 API + + # Version functions + herr_t H5get_libversion(unsigned *majnum, unsigned *minnum, + unsigned *relnum ) + herr_t H5check_version(unsigned majnum, unsigned minnum, + unsigned relnum ) + + # misc + #herr_t H5free_memory(void *buf) # new in HDF5 1.8.13 + + # Operations with files + hid_t H5Fcreate(char *filename, unsigned int flags, + hid_t create_plist, hid_t access_plist) + hid_t H5Fopen(char *name, unsigned flags, hid_t access_id) + herr_t H5Fclose (hid_t file_id) + htri_t H5Fis_hdf5(char *name) + herr_t H5Fflush(hid_t object_id, H5F_scope_t scope) + herr_t H5Fget_vfd_handle(hid_t file_id, hid_t fapl_id, void **file_handle) + ssize_t H5Fget_file_image(hid_t file_id, void *buf_ptr, size_t buf_len) + herr_t H5Fget_filesize(hid_t file_id, hsize_t *size) + hid_t H5Fget_create_plist(hid_t file_id) + + # Operations with groups + hid_t H5Gcreate(hid_t loc_id, char *name, hid_t lcpl_id, hid_t gcpl_id, + hid_t gapl_id) + hid_t H5Gopen(hid_t loc_id, char *name, hid_t gapl_id) + herr_t H5Gclose(hid_t group_id) + + # Operations with links + herr_t H5Ldelete(hid_t file_id, char *name, hid_t lapl_id) + herr_t H5Lmove(hid_t src_loc_id, char *src_name, + hid_t dst_loc_id, char *dst_name, hid_t lcpl, hid_t lap) + + # For dealing with datasets + hid_t H5Dopen(hid_t file_id, char *name, hid_t dapl_id) + herr_t H5Dclose(hid_t dset_id) + herr_t H5Dread(hid_t dset_id, hid_t mem_type_id, hid_t mem_space_id, + hid_t file_space_id, hid_t plist_id, void *buf) + herr_t H5Dwrite(hid_t dset_id, hid_t mem_type_id, hid_t mem_space_id, + hid_t file_space_id, hid_t plist_id, void *buf) + 
hid_t H5Dget_type(hid_t dset_id) + hid_t H5Dget_space(hid_t dset_id) + herr_t H5Dvlen_reclaim(hid_t type_id, hid_t space_id, hid_t plist_id, + void *buf) + hid_t H5Dget_create_plist(hid_t dataset_id) + hsize_t H5Dget_storage_size(hid_t dataset_id) + herr_t H5Dvlen_get_buf_size(hid_t dataset_id, hid_t type_id, hid_t space_id, + hsize_t *size) + + # Functions for dealing with dataspaces + hid_t H5Screate_simple(int rank, hsize_t dims[], hsize_t maxdims[]) + int H5Sget_simple_extent_ndims(hid_t space_id) + int H5Sget_simple_extent_dims(hid_t space_id, hsize_t dims[], + hsize_t maxdims[]) + herr_t H5Sselect_all(hid_t spaceid) + herr_t H5Sselect_hyperslab(hid_t space_id, H5S_seloper_t op, + hsize_t start[], hsize_t _stride[], + hsize_t count[], hsize_t _block[]) + herr_t H5Sselect_elements(hid_t space_id, H5S_seloper_t op, + size_t num_elements, hsize_t *coord) + herr_t H5Sclose(hid_t space_id) + + + # Functions for dealing with datatypes + H5T_class_t H5Tget_class(hid_t type_id) + hid_t H5Tget_super(hid_t type) + H5T_sign_t H5Tget_sign(hid_t type_id) + H5T_order_t H5Tget_order(hid_t type_id) + size_t H5Tget_size(hid_t type_id) + herr_t H5Tset_size(hid_t type_id, size_t size) + size_t H5Tget_precision(hid_t dtype_id) + herr_t H5Tset_precision(hid_t type_id, size_t prec) + hid_t H5Tcreate(H5T_class_t type, size_t size) + hid_t H5Tvlen_create(hid_t base_type_id) + hid_t H5Tcopy(hid_t type_id) + herr_t H5Tclose(hid_t type_id) + htri_t H5Tequal(hid_t dtype_id1, hid_t dtype_id2) + + # Operations defined on string data types + htri_t H5Tis_variable_str(hid_t dtype_id) + + # Operations for compound data types + int H5Tget_nmembers(hid_t type_id) + char *H5Tget_member_name(hid_t type_id, unsigned membno) + hid_t H5Tget_member_type(hid_t type_id, unsigned membno) + hid_t H5Tget_native_type(hid_t type_id, H5T_direction_t direction) + herr_t H5Tget_member_value(hid_t type_id, int membno, void *value) + size_t H5Tget_member_offset(hid_t type_id, unsigned memb_no) + int H5Tget_offset(hid_t type_id) + herr_t H5Tinsert(hid_t parent_id, char *name, size_t offset, + hid_t member_id) + herr_t H5Tpack(hid_t type_id) + + # Operations for enumerated data types + hid_t H5Tenum_create(hid_t base_id) + herr_t H5Tenum_insert(hid_t type, char *name, void *value) + + # Operations for array data types + hid_t H5Tarray_create(hid_t base_id, int ndims, hsize_t dims[]) + int H5Tget_array_ndims(hid_t type_id) + int H5Tget_array_dims(hid_t type_id, hsize_t dims[]) + + # Operations with attributes + herr_t H5Adelete(hid_t loc_id, char *name) + int H5Aget_num_attrs(hid_t loc_id) + size_t H5Aget_name(hid_t attr_id, size_t buf_size, char *buf) + hid_t H5Aopen_idx(hid_t loc_id, unsigned int idx) + herr_t H5Aread(hid_t attr_id, hid_t mem_type_id, void *buf) + herr_t H5Aclose(hid_t attr_id) + + # Operations with properties + hid_t H5Pcreate(hid_t plist_id) + herr_t H5Pclose(hid_t plist_id) + herr_t H5Pset_cache(hid_t plist_id, int mdc_nelmts, int rdcc_nelmts, + size_t rdcc_nbytes, double rdcc_w0) + herr_t H5Pset_sieve_buf_size(hid_t fapl_id, hsize_t size) + H5D_layout_t H5Pget_layout(hid_t plist) + int H5Pget_chunk(hid_t plist, int max_ndims, hsize_t *dims) + + hid_t H5Pget_driver(hid_t plist_id) + herr_t H5Pset_fapl_sec2(hid_t fapl_id) + #herr_t H5Pget_fapl_direct(hid_t fapl_id, size_t *alignment, + # size_t *block_size, size_t *cbuf_size) + #herr_t H5Pset_fapl_direct(hid_t fapl_id, size_t alignment, + # size_t block_size, size_t cbuf_size) + herr_t H5Pset_fapl_log(hid_t fapl_id, const char *logfile, + unsigned long long flags, 
size_t buf_size) + #herr_t H5Pset_fapl_windows(hid_t fapl_id) + herr_t H5Pset_fapl_stdio(hid_t fapl_id) + #herr_t H5Pget_fapl_core(hid_t fapl_id, size_t *increment, + # hbool_t *backing_store) + herr_t H5Pset_fapl_core(hid_t fapl_id, size_t increment, + hbool_t backing_store) + #herr_t H5Pget_fapl_family(hid_t fapl_id, hsize_t *memb_size, + # hid_t *memb_fapl_id) + herr_t H5Pset_fapl_family(hid_t fapl_id, hsize_t memb_size, + hid_t memb_fapl_id) + #herr_t H5Pget_fapl_multi(hid_t fapl_id, H5FD_mem_t *memb_map, + # hid_t *memb_fapl, const char **memb_name, + # haddr_t *memb_addr, hbool_t *relax) + herr_t H5Pset_fapl_multi(hid_t fapl_id, H5FD_mem_t *memb_map, + hid_t *memb_fapl, char **memb_name, + haddr_t *memb_addr, hbool_t relax) + herr_t H5Pset_fapl_split(hid_t fapl_id, char *meta_ext, + hid_t meta_plist_id, char *raw_ext, + hid_t raw_plist_id) + #herr_t H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm, MPI_Info *info) + #herr_t H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info) + + #herr_t H5Pget_fapl_mpiposix(hid_t fapl_id, MPI_Comm *comm, + # hbool_t *use_gpfs_hints) + #herr_t H5Pset_fapl_mpiposix(hid_t fapl_id, MPI_Comm comm, + # hbool_t use_gpfs_hints) + herr_t H5Pset_file_image(hid_t fapl_id, void *buf_ptr, size_t buf_len) + herr_t H5Pget_userblock(hid_t plist, hsize_t *size) + herr_t H5Pset_userblock(hid_t plist, hsize_t size) + herr_t H5Pget_obj_track_times(hid_t ocpl_id, hbool_t *track_times) + + # Error Handling Interface + #herr_t H5Eget_auto(hid_t estack_id, H5E_auto_t *func, void** data) + herr_t H5Eset_auto(hid_t estack_id, H5E_auto_t func, void *data) + herr_t H5Eprint(hid_t estack_id, FILE *stream) + herr_t H5Ewalk(hid_t estack_id, H5E_direction_t dir, H5E_walk_t func, + void *data) + #hid_t H5Eget_current_stack(void) + #herr_t H5Eclose_stack(hid_t estack_id) + #ssize_t H5Eget_num(hid_t estack_id) + ssize_t H5Eget_msg(hid_t mesg_id, H5E_type_t* mesg_type, char* mesg, + size_t size) + #herr_t H5Eclose_msg(hid_t mesg_id) + #ssize_t H5Eget_class_name(hid_t class_id, char* name, size_t size) + + # Onject interface + herr_t H5Oget_info(hid_t object_id, H5O_info_t *object_info) + + # Operations with filters and compression interface + ctypedef int H5Z_filter_t + + #herr_t H5Zregister(const void *cls) + herr_t H5Zunregister(H5Z_filter_t id) + #htri_t H5Zfilter_avail(H5Z_filter_t id) + #herr_t H5Zget_filter_info(H5Z_filter_t, unsigned int*) + + # Operations on the references + H5I_type_t H5Iget_type(hid_t id) + herr_t H5Rcreate(void *reference, hid_t loc_id, const char *name, H5R_type_t type, hid_t space_id) + hid_t H5Rdereference(hid_t dset, H5R_type_t rtype, void *reference) + herr_t H5Oclose( hid_t object_id ) + + +# Specific HDF5 functions for PyTables +cdef extern from "H5ATTR.h" nogil: + herr_t H5ATTRget_attribute(hid_t loc_id, char *attr_name, + hid_t type_id, void *data) + hsize_t H5ATTRget_attribute_string(hid_t loc_id, char *attr_name, + char **attr_value, int *cset) + hsize_t H5ATTRget_attribute_vlen_string_array(hid_t loc_id, char *attr_name, + char ***attr_value, int *cset) + herr_t H5ATTRset_attribute(hid_t obj_id, char *attr_name, + hid_t type_id, size_t rank, hsize_t *dims, + char *attr_data) + herr_t H5ATTRset_attribute_string(hid_t loc_id, char *attr_name, + char *attr_data, hsize_t attr_size, + int cset) + herr_t H5ATTRfind_attribute(hid_t loc_id, char *attr_name) + herr_t H5ATTRget_type_ndims(hid_t loc_id, char *attr_name, + hid_t *type_id, H5T_class_t *class_id, + size_t *type_size, int *rank) + herr_t H5ATTRget_dims(hid_t loc_id, char *attr_name, 
hsize_t *dims) + + +# Functions for operations with ARRAY +cdef extern from "H5ARRAY.h" nogil: + herr_t H5ARRAYget_ndims(hid_t dataset_id, int *rank) + herr_t H5ARRAYget_info(hid_t dataset_id, hid_t type_id, hsize_t *dims, + hsize_t *maxdims, H5T_class_t *super_class_id, + char *byteorder) + + +# Some utilities +cdef extern from "utils.h" nogil: + herr_t set_cache_size(hid_t file_id, size_t cache_size) + int get_objinfo(hid_t loc_id, char *name) + int get_linkinfo(hid_t loc_id, char *name) + hsize_t get_len_of_range(hsize_t lo, hsize_t hi, hsize_t step) + hid_t create_ieee_float16(char *byteorder) + hid_t create_ieee_complex64(char *byteorder) + hid_t create_ieee_complex128(char *byteorder) + hid_t create_ieee_complex192(char *byteorder) + hid_t create_ieee_complex256(char *byteorder) + herr_t set_order(hid_t type_id, char *byteorder) + herr_t get_order(hid_t type_id, char *byteorder) + int is_complex(hid_t type_id) + herr_t truncate_dset(hid_t dataset_id, int maindim, hsize_t size) + + # compatibility + herr_t pt_H5Pset_fapl_direct(hid_t fapl_id, size_t alignment, + size_t block_size, size_t cbuf_size) + herr_t pt_H5Pset_fapl_windows(hid_t fapl_id) + herr_t pt_H5Pset_file_image(hid_t fapl_id, void *buf_ptr, size_t buf_len) + ssize_t pt_H5Fget_file_image(hid_t file_id, void *buf_ptr, size_t buf_len) + herr_t pt_H5free_memory(void *buf) + + int H5_HAVE_DIRECT_DRIVER, H5_HAVE_WINDOWS_DRIVER, H5_HAVE_IMAGE_FILE + + +cdef extern from "utils.h": + object Giterate(hid_t parent_id, hid_t loc_id, char *name) + object Aiterate(hid_t loc_id) + object H5UIget_info(hid_t loc_id, char *name, char *byteorder) + + +# Type conversion routines +cdef extern from "typeconv.h" nogil: + void conv_float64_timeval32(void *base, + unsigned long byteoffset, + unsigned long bytestride, + long long nrecords, + unsigned long nelements, + int sense) + +# Blosc registration +cdef extern from "blosc_filter.h" nogil: + int register_blosc(char **version, char **date) + int FILTER_BLOSC diff --git a/tables/description.py b/tables/description.py new file mode 100644 index 0000000..bd90599 --- /dev/null +++ b/tables/description.py @@ -0,0 +1,972 @@ +"""Classes for describing columns for ``Table`` objects.""" + +import copy +import warnings + +import numpy as np + +from . import atom +from .path import check_name_validity + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +def same_position(oldmethod): + """Decorate `oldmethod` to also compare the `_v_pos` attribute.""" + def newmethod(self, other): + try: + other._v_pos + except AttributeError: + return False # not a column definition + return self._v_pos == other._v_pos and oldmethod(self, other) + newmethod.__name__ = oldmethod.__name__ + newmethod.__doc__ = oldmethod.__doc__ + return newmethod + + +class Col(atom.Atom, metaclass=type): + """Defines a non-nested column. + + Col instances are used as a means to declare the different properties of a + non-nested column in a table or nested column. Col classes are descendants + of their equivalent Atom classes (see :ref:`AtomClassDescr`), but their + instances have an additional _v_pos attribute that is used to decide the + position of the column inside its parent table or nested column (see the + IsDescription class in :ref:`IsDescriptionClassDescr` for more information + on column positions). + + In the same fashion as Atom, you should use a particular Col descendant + class whenever you know the exact type you will need when writing your + code. 
Otherwise, you may use one of the Col.from_*() factory methods. + + Each factory method inherited from the Atom class is available with the + same signature, plus an additional pos parameter (placed in last position) + which defaults to None and that may take an integer value. This parameter + might be used to specify the position of the column in the table. + + Besides, there are the next additional factory methods, available only for + Col objects. + + The following parameters are available for most Col-derived constructors. + + Parameters + ---------- + itemsize : int + For types with a non-fixed size, this sets the size in bytes of + individual items in the column. + shape : tuple + Sets the shape of the column. An integer shape of N is equivalent to + the tuple (N,). + dflt + Sets the default value for the column. + pos : int + Sets the position of column in table. If unspecified, the position + will be randomly selected. + + """ + + _class_from_prefix = {} # filled as column classes are created + """Maps column prefixes to column classes.""" + + @classmethod + def prefix(cls): + """Return the column class prefix.""" + + cname = cls.__name__ + return cname[:cname.rfind('Col')] + + @classmethod + def from_atom(cls, atom, pos=None, _offset=None): + """Create a Col definition from a PyTables atom. + + An optional position may be specified as the pos argument. + + """ + + prefix = atom.prefix() + kwargs = atom._get_init_args() + colclass = cls._class_from_prefix[prefix] + return colclass(pos=pos, _offset=_offset, **kwargs) + + @classmethod + def from_sctype(cls, sctype, shape=(), dflt=None, pos=None): + """Create a `Col` definition from a NumPy scalar type `sctype`. + + Optional shape, default value and position may be specified as + the `shape`, `dflt` and `pos` arguments, respectively. + Information in the `sctype` not represented in a `Col` is + ignored. + + """ + + newatom = atom.Atom.from_sctype(sctype, shape, dflt) + return cls.from_atom(newatom, pos=pos) + + @classmethod + def from_dtype(cls, dtype, dflt=None, pos=None, _offset=None): + """Create a `Col` definition from a NumPy `dtype`. + + Optional default value and position may be specified as the + `dflt` and `pos` arguments, respectively. The `dtype` must have + a byte order which is irrelevant or compatible with that of the + system. Information in the `dtype` not represented in a `Col` + is ignored. + + """ + + newatom = atom.Atom.from_dtype(dtype, dflt) + return cls.from_atom(newatom, pos=pos, _offset=_offset) + + @classmethod + def from_type(cls, type, shape=(), dflt=None, pos=None): + """Create a `Col` definition from a PyTables `type`. + + Optional shape, default value and position may be specified as + the `shape`, `dflt` and `pos` arguments, respectively. + + """ + + newatom = atom.Atom.from_type(type, shape, dflt) + return cls.from_atom(newatom, pos=pos) + + @classmethod + def from_kind(cls, kind, itemsize=None, shape=(), dflt=None, pos=None): + """Create a `Col` definition from a PyTables `kind`. + + Optional item size, shape, default value and position may be + specified as the `itemsize`, `shape`, `dflt` and `pos` + arguments, respectively. Bear in mind that not all columns + support a default item size. 
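+
+        For example, ``Col.from_kind('float', dflt=1, shape=(2, 3), pos=1)``
+        builds a floating-point column definition with the given default
+        value, shape and position, as done in the test code at the bottom of
+        this module.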
+ + """ + + newatom = atom.Atom.from_kind(kind, itemsize, shape, dflt) + return cls.from_atom(newatom, pos=pos) + + @classmethod + def _subclass_from_prefix(cls, prefix): + """Get a column subclass for the given `prefix`.""" + + cname = '%sCol' % prefix + class_from_prefix = cls._class_from_prefix + if cname in class_from_prefix: + return class_from_prefix[cname] + atombase = getattr(atom, '%sAtom' % prefix) + + class NewCol(cls, atombase): + """Defines a non-nested column of a particular type. + + The constructor accepts the same arguments as the equivalent + `Atom` class, plus an additional ``pos`` argument for + position information, which is assigned to the `_v_pos` + attribute. + + """ + + def __init__(self, *args, **kwargs): + pos = kwargs.pop('pos', None) + offset = kwargs.pop('_offset', None) + class_from_prefix = self._class_from_prefix + atombase.__init__(self, *args, **kwargs) + # The constructor of an abstract atom may have changed + # the class of `self` to something different of `NewCol` + # and `atombase` (that's why the prefix map is saved). + if self.__class__ is not NewCol: + colclass = class_from_prefix[self.prefix()] + self.__class__ = colclass + self._v_pos = pos + self._v_offset = offset + + __eq__ = same_position(atombase.__eq__) + _is_equal_to_atom = same_position(atombase._is_equal_to_atom) + + # XXX: API incompatible change for PyTables 3 line + # Overriding __eq__ blocks inheritance of __hash__ in 3.x + # def __hash__(self): + # return hash((self._v_pos, self.atombase)) + + if prefix == 'Enum': + _is_equal_to_enumatom = same_position( + atombase._is_equal_to_enumatom) + + NewCol.__name__ = cname + + class_from_prefix[prefix] = NewCol + return NewCol + + def __repr__(self): + # Reuse the atom representation. + atomrepr = super().__repr__() + lpar = atomrepr.index('(') + rpar = atomrepr.rindex(')') + atomargs = atomrepr[lpar + 1:rpar] + classname = self.__class__.__name__ + return f'{classname}({atomargs}, pos={self._v_pos})' + + def _get_init_args(self): + """Get a dictionary of instance constructor arguments.""" + + kwargs = {arg: getattr(self, arg) for arg in ('shape', 'dflt')} + kwargs['pos'] = getattr(self, '_v_pos', None) + return kwargs + + +def _generate_col_classes(): + """Generate all column classes.""" + + # Abstract classes are not in the class map. + cprefixes = ['Int', 'UInt', 'Float', 'Time'] + for (kind, kdata) in atom.atom_map.items(): + if hasattr(kdata, 'kind'): # atom class: non-fixed item size + atomclass = kdata + cprefixes.append(atomclass.prefix()) + else: # dictionary: fixed item size + for atomclass in kdata.values(): + cprefixes.append(atomclass.prefix()) + + # Bottom-level complex classes are not in the type map, of course. + # We still want the user to get the compatibility warning, though. + cprefixes.extend(['Complex32', 'Complex64', 'Complex128']) + if hasattr(atom, 'Complex192Atom'): + cprefixes.append('Complex192') + if hasattr(atom, 'Complex256Atom'): + cprefixes.append('Complex256') + + for cprefix in cprefixes: + newclass = Col._subclass_from_prefix(cprefix) + yield newclass + + +# Create all column classes. 
+# for _newclass in _generate_col_classes(): +# exec('%s = _newclass' % _newclass.__name__) +# del _newclass + +StringCol = Col._subclass_from_prefix('String') +BoolCol = Col._subclass_from_prefix('Bool') +EnumCol = Col._subclass_from_prefix('Enum') +IntCol = Col._subclass_from_prefix('Int') +Int8Col = Col._subclass_from_prefix('Int8') +Int16Col = Col._subclass_from_prefix('Int16') +Int32Col = Col._subclass_from_prefix('Int32') +Int64Col = Col._subclass_from_prefix('Int64') +UIntCol = Col._subclass_from_prefix('UInt') +UInt8Col = Col._subclass_from_prefix('UInt8') +UInt16Col = Col._subclass_from_prefix('UInt16') +UInt32Col = Col._subclass_from_prefix('UInt32') +UInt64Col = Col._subclass_from_prefix('UInt64') + +FloatCol = Col._subclass_from_prefix('Float') +if hasattr(atom, 'Float16Atom'): + Float16Col = Col._subclass_from_prefix('Float16') +Float32Col = Col._subclass_from_prefix('Float32') +Float64Col = Col._subclass_from_prefix('Float64') +if hasattr(atom, 'Float96Atom'): + Float96Col = Col._subclass_from_prefix('Float96') +if hasattr(atom, 'Float128Atom'): + Float128Col = Col._subclass_from_prefix('Float128') + +ComplexCol = Col._subclass_from_prefix('Complex') +Complex32Col = Col._subclass_from_prefix('Complex32') +Complex64Col = Col._subclass_from_prefix('Complex64') +Complex128Col = Col._subclass_from_prefix('Complex128') +if hasattr(atom, 'Complex192Atom'): + Complex192Col = Col._subclass_from_prefix('Complex192') +if hasattr(atom, 'Complex256Atom'): + Complex256Col = Col._subclass_from_prefix('Complex256') + +TimeCol = Col._subclass_from_prefix('Time') +Time32Col = Col._subclass_from_prefix('Time32') +Time64Col = Col._subclass_from_prefix('Time64') + + +# Table description classes +# ========================= +class Description: + """This class represents descriptions of the structure of tables. + + An instance of this class is automatically bound to Table (see + :ref:`TableClassDescr`) objects when they are created. It provides a + browseable representation of the structure of the table, made of non-nested + (Col - see :ref:`ColClassDescr`) and nested (Description) columns. + + Column definitions under a description can be accessed as attributes of it + (*natural naming*). For instance, if table.description is a Description + instance with a column named col1 under it, the later can be accessed as + table.description.col1. If col1 is nested and contains a col2 column, this + can be accessed as table.description.col1.col2. Because of natural naming, + the names of members start with special prefixes, like in the Group class + (see :ref:`GroupClassDescr`). + + + .. rubric:: Description attributes + + .. attribute:: _v_colobjects + + A dictionary mapping the names of the columns hanging + directly from the associated table or nested column to their + respective descriptions (Col - see :ref:`ColClassDescr` or + Description - see :ref:`DescriptionClassDescr` instances). + + .. versionchanged:: 3.0 + The *_v_colObjects* attribute has been renamed into + *_v_colobjects*. + + .. attribute:: _v_dflts + + A dictionary mapping the names of non-nested columns + hanging directly from the associated table or nested column + to their respective default values. + + .. attribute:: _v_dtype + + The NumPy type which reflects the structure of this + table or nested column. You can use this as the + dtype argument of NumPy array factories. + + .. 
attribute:: _v_dtypes + + A dictionary mapping the names of non-nested columns + hanging directly from the associated table or nested column + to their respective NumPy types. + + .. attribute:: _v_is_nested + + Whether the associated table or nested column contains + further nested columns or not. + + .. attribute:: _v_itemsize + + The size in bytes of an item in this table or nested column. + + .. attribute:: _v_name + + The name of this description group. The name of the + root group is '/'. + + .. attribute:: _v_names + + A list of the names of the columns hanging directly + from the associated table or nested column. The order of the + names matches the order of their respective columns in the + containing table. + + .. attribute:: _v_nested_descr + + A nested list of pairs of (name, format) tuples for all the columns + under this table or nested column. You can use this as the dtype and + descr arguments of NumPy array factories. + + .. versionchanged:: 3.0 + The *_v_nestedDescr* attribute has been renamed into + *_v_nested_descr*. + + .. attribute:: _v_nested_formats + + A nested list of the NumPy string formats (and shapes) of all the + columns under this table or nested column. You can use this as the + formats argument of NumPy array factories. + + .. versionchanged:: 3.0 + The *_v_nestedFormats* attribute has been renamed into + *_v_nested_formats*. + + .. attribute:: _v_nestedlvl + + The level of the associated table or nested column in the nested + datatype. + + .. attribute:: _v_nested_names + + A nested list of the names of all the columns under this table or + nested column. You can use this as the names argument of NumPy array + factories. + + .. versionchanged:: 3.0 + The *_v_nestedNames* attribute has been renamed into + *_v_nested_names*. + + .. attribute:: _v_pathname + + Pathname of the table or nested column. + + .. attribute:: _v_pathnames + + A list of the pathnames of all the columns under this table or nested + column (in preorder). If it does not contain nested columns, this is + exactly the same as the :attr:`Description._v_names` attribute. + + .. attribute:: _v_types + + A dictionary mapping the names of non-nested columns hanging directly + from the associated table or nested column to their respective PyTables + types. + + .. attribute:: _v_offsets + + A list of offsets for all the columns. If the list is empty, means + that there are no padding in the data structure. However, the support + for offsets is currently limited to flat tables; for nested tables, the + potential padding is always removed (exactly the same as in pre-3.5 + versions), and this variable is set to empty. + + .. versionadded:: 3.5 + Previous to this version all the compound types were converted + internally to 'packed' types, i.e. with no padding between the + component types. Starting with 3.5, the holes in native HDF5 + types (non-nested) are honored and replicated during dataset + and attribute copies. 
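+
+    For instance, a Description can also be built directly from a mapping of
+    column definitions (an illustrative sketch; the column names here are
+    made up)::
+
+        desc = Description({'x': Int32Col(pos=0), 'y': Float64Col(pos=1)})
+        print(desc._v_names)    # ['x', 'y']
+        print(desc._v_dtype)    # the equivalent NumPy compound dtype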
+ """ + + def __init__(self, classdict, nestedlvl=-1, validate=True, ptparams=None): + + if not classdict: + raise ValueError("cannot create an empty data type") + + # Do a shallow copy of classdict just in case this is going to + # be shared by other instances + newdict = self.__dict__ + newdict["_v_name"] = "/" # The name for root descriptor + newdict["_v_names"] = [] + newdict["_v_dtypes"] = {} + newdict["_v_types"] = {} + newdict["_v_dflts"] = {} + newdict["_v_colobjects"] = {} + newdict["_v_is_nested"] = False + nestedFormats = [] + nestedDType = [] + + if not hasattr(newdict, "_v_nestedlvl"): + newdict["_v_nestedlvl"] = nestedlvl + 1 + + cols_with_pos = [] # colum (position, name) pairs + cols_no_pos = [] # just column names + cols_offsets = [] # the offsets of the columns + valid_offsets = False # by default there a no valid offsets + + # Check for special variables and convert column descriptions + for (name, descr) in classdict.items(): + if name.startswith('_v_'): + if name in newdict: + # print("Warning!") + # special methods &c: copy to newdict, warn about conflicts + warnings.warn("Can't set attr %r in description class %r" + % (name, self)) + else: + # print("Special variable!-->", name, classdict[name]) + newdict[name] = descr + continue # This variable is not needed anymore + + columns = None + if (type(descr) == type(IsDescription) and + issubclass(descr, IsDescription)): + # print("Nested object (type I)-->", name) + columns = descr().columns + elif (type(descr.__class__) == type(IsDescription) and + issubclass(descr.__class__, IsDescription)): + # print("Nested object (type II)-->", name) + columns = descr.columns + elif isinstance(descr, dict): + # print("Nested object (type III)-->", name) + columns = descr + else: + # print("Nested object (type IV)-->", name) + descr = copy.copy(descr) + # The copies above and below ensure that the structures + # provided by the user will remain unchanged even if we + # tamper with the values of ``_v_pos`` here. + if columns is not None: + descr = Description(copy.copy(columns), self._v_nestedlvl, + ptparams=ptparams) + classdict[name] = descr + + pos = getattr(descr, '_v_pos', None) + if pos is None: + cols_no_pos.append(name) + else: + cols_with_pos.append((pos, name)) + offset = getattr(descr, '_v_offset', None) + if offset is not None: + cols_offsets.append(offset) + + # Sort field names: + # + # 1. Fields with explicit positions, according to their + # positions (and their names if coincident). + # 2. Fields with no position, in alphabetical order. + cols_with_pos.sort() + cols_no_pos.sort() + keys = [name for (pos, name) in cols_with_pos] + cols_no_pos + + pos = 0 + nested = False + # Get properties for compound types + for k in keys: + if validate: + # Check for key name validity + check_name_validity(k) + # Class variables + object = classdict[k] + newdict[k] = object # To allow natural naming + if not isinstance(object, (Col, Description)): + raise TypeError('Passing an incorrect value to a table column.' + ' Expected a Col (or subclass) instance and ' + 'got: "%s". Please make use of the Col(), or ' + 'descendant, constructor to properly ' + 'initialize columns.' 
% object) + object._v_pos = pos # Set the position of this object + object._v_parent = self # The parent description + pos += 1 + newdict['_v_colobjects'][k] = object + newdict['_v_names'].append(k) + object.__dict__['_v_name'] = k + + if not isinstance(k, str): + # numpy only accepts "str" for field names + # Python 3.x: bytes --> str (unicode) + kk = k.decode() + else: + kk = k + + if isinstance(object, Col): + dtype = object.dtype + newdict['_v_dtypes'][k] = dtype + newdict['_v_types'][k] = object.type + newdict['_v_dflts'][k] = object.dflt + nestedFormats.append(object.recarrtype) + baserecarrtype = dtype.base.str[1:] + nestedDType.append((kk, baserecarrtype, dtype.shape)) + else: # A description + nestedFormats.append(object._v_nested_formats) + nestedDType.append((kk, object._v_dtype)) + nested = True + + # Useful for debugging purposes + # import traceback + # if ptparams is None: + # print("*** print_stack:") + # traceback.print_stack() + + # Check whether we are gonna use padding or not. Two possibilities: + # 1) Make padding True by default (except if ALLOW_PADDING is set + # to False) + # 2) Make padding False by default (except if ALLOW_PADDING is set + # to True) + # Currently we choose 1) because it favours honoring padding even on + # unhandled situations (should be very few). + # However, for development, option 2) is recommended as it catches + # most of the unhandled situations. + allow_padding = ptparams is None or ptparams['ALLOW_PADDING'] + # allow_padding = ptparams is not None and ptparams['ALLOW_PADDING'] + if (allow_padding and + len(cols_offsets) > 1 and + len(keys) == len(cols_with_pos) and + len(keys) == len(cols_offsets) and + not nested): # TODO: support offsets with nested types + # We have to sort the offsets too, as they must follow the column + # order. As the offsets and the pos should be place in the same + # order, a single sort is enough here. + cols_offsets.sort() + valid_offsets = True + else: + newdict['_v_offsets'] = [] + + # Assign the format list to _v_nested_formats + newdict['_v_nested_formats'] = nestedFormats + + if self._v_nestedlvl == 0: + # Get recursively nested _v_nested_names and _v_nested_descr attrs + self._g_set_nested_names_descr() + # Get pathnames for nested groups + self._g_set_path_names() + # Check the _v_byteorder has been used an issue an Error + if hasattr(self, "_v_byteorder"): + raise ValueError( + "Using a ``_v_byteorder`` in the description is obsolete. " + "Use the byteorder parameter in the constructor instead.") + + # Compute the dtype with offsets or without + # print("offsets ->", cols_offsets, nestedDType, nested, valid_offsets) + if valid_offsets: + # TODO: support offsets within nested types + dtype_fields = { + 'names': newdict['_v_names'], 'formats': nestedFormats, + 'offsets': cols_offsets} + itemsize = newdict.get('_v_itemsize', None) + if itemsize is not None: + dtype_fields['itemsize'] = itemsize + dtype = np.dtype(dtype_fields) + else: + dtype = np.dtype(nestedDType) + newdict['_v_dtype'] = dtype + newdict['_v_itemsize'] = dtype.itemsize + newdict['_v_offsets'] = [dtype.fields[name][1] for name in dtype.names] + + def _g_set_nested_names_descr(self): + """Computes the nested names and descriptions for nested datatypes.""" + + names = self._v_names + fmts = self._v_nested_formats + self._v_nested_names = names[:] # Important to do a copy! 
+ self._v_nested_descr = list(zip(names, fmts)) + for i, name in enumerate(names): + new_object = self._v_colobjects[name] + if isinstance(new_object, Description): + new_object._g_set_nested_names_descr() + # replace the column nested name by a correct tuple + self._v_nested_names[i] = (name, new_object._v_nested_names) + self._v_nested_descr[i] = (name, new_object._v_nested_descr) + # set the _v_is_nested flag + self._v_is_nested = True + + def _g_set_path_names(self): + """Compute the pathnames for arbitrary nested descriptions. + + This method sets the ``_v_pathname`` and ``_v_pathnames`` + attributes of all the elements (both descriptions and columns) + in this nested description. + + """ + + def get_cols_in_order(description): + return [description._v_colobjects[colname] + for colname in description._v_names] + + def join_paths(path1, path2): + if not path1: + return path2 + return f'{path1}/{path2}' + + # The top of the stack always has a nested description + # and a list of its child columns + # (be they nested ``Description`` or non-nested ``Col`` objects). + # In the end, the list contains only a list of column paths + # under this one. + # + # For instance, given this top of the stack:: + # + # (, [, ]) + # + # After computing the rest of the stack, the top is:: + # + # (, ['a', 'a/m', 'a/n', ... , 'b', ...]) + + stack = [] + + # We start by pushing the top-level description + # and its child columns. + self._v_pathname = '' + stack.append((self, get_cols_in_order(self))) + + while stack: + desc, cols = stack.pop() + head = cols[0] + + # What's the first child in the list? + if isinstance(head, Description): + # A nested description. We remove it from the list and + # push it with its child columns. This will be the next + # handled description. + head._v_pathname = join_paths(desc._v_pathname, head._v_name) + stack.append((desc, cols[1:])) # alter the top + stack.append((head, get_cols_in_order(head))) # new top + elif isinstance(head, Col): + # A non-nested column. We simply remove it from the + # list and append its name to it. + head._v_pathname = join_paths(desc._v_pathname, head._v_name) + cols.append(head._v_name) # alter the top + stack.append((desc, cols[1:])) # alter the top + else: + # Since paths and names are appended *to the end* of + # children lists, a string signals that no more children + # remain to be processed, so we are done with the + # description at the top of the stack. + assert isinstance(head, str) + # Assign the computed set of descendent column paths. + desc._v_pathnames = cols + if len(stack) > 0: + # Compute the paths with respect to the parent node + # (including the path of the current description) + # and append them to its list. + descName = desc._v_name + colPaths = [join_paths(descName, path) for path in cols] + colPaths.insert(0, descName) + parentCols = stack[-1][1] + parentCols.extend(colPaths) + # (Nothing is pushed, we are done with this description.) + + def _f_walk(self, type='All'): + """Iterate over nested columns. + + If type is 'All' (the default), all column description objects (Col and + Description instances) are yielded in top-to-bottom order (preorder). + + If type is 'Col' or 'Description', only column descriptions of that + type are yielded. 
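+
+        For example (an illustrative sketch assuming ``table`` is an open
+        Table instance)::
+
+            for col in table.description._f_walk(type='Col'):
+                print(col._v_pathname, col.dtype)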
+ + """ + + if type not in ["All", "Col", "Description"]: + raise ValueError("""\ +type can only take the parameters 'All', 'Col' or 'Description'.""") + + stack = [self] + while stack: + object = stack.pop(0) # pop at the front so as to ensure the order + if type in ["All", "Description"]: + yield object # yield description + for name in object._v_names: + new_object = object._v_colobjects[name] + if isinstance(new_object, Description): + stack.append(new_object) + else: + if type in ["All", "Col"]: + yield new_object # yield column + + def __repr__(self): + """Gives a detailed Description column representation.""" + + rep = ['%s\"%s\": %r' % + (" " * self._v_nestedlvl, k, self._v_colobjects[k]) + for k in self._v_names] + return '{\n %s}' % (',\n '.join(rep)) + + def __str__(self): + """Gives a brief Description representation.""" + + return f'Description({self._v_nested_descr})' + + +class MetaIsDescription(type): + """Helper metaclass to return the class variables as a dictionary.""" + + def __new__(mcs, classname, bases, classdict): + """Return a new class with a "columns" attribute filled.""" + + newdict = {"columns": {}, } + if '__doc__' in classdict: + newdict['__doc__'] = classdict['__doc__'] + for b in bases: + if "columns" in b.__dict__: + newdict["columns"].update(b.__dict__["columns"]) + for k in classdict: + # if not (k.startswith('__') or k.startswith('_v_')): + # We let pass _v_ variables to configure class behaviour + if not (k.startswith('__')): + newdict["columns"][k] = classdict[k] + + # Return a new class with the "columns" attribute filled + return type.__new__(mcs, classname, bases, newdict) + + +class IsDescription(metaclass=MetaIsDescription): + """Description of the structure of a table or nested column. + + This class is designed to be used as an easy, yet meaningful way to + describe the structure of new Table (see :ref:`TableClassDescr`) datasets + or nested columns through the definition of *derived classes*. In order to + define such a class, you must declare it as descendant of IsDescription, + with as many attributes as columns you want in your table. The name of each + attribute will become the name of a column, and its value will hold a + description of it. + + Ordinary columns can be described using instances of the Col class (see + :ref:`ColClassDescr`). Nested columns can be described by using classes + derived from IsDescription, instances of it, or name-description + dictionaries. Derived classes can be declared in place (in which case the + column takes the name of the class) or referenced by name. + + Nested columns can have a _v_pos special attribute which sets the + *relative* position of the column among sibling columns *also having + explicit positions*. The pos constructor argument of Col instances is used + for the same purpose. Columns with no explicit position will be placed + afterwards in alphanumeric order. + + Once you have created a description object, you can pass it to the Table + constructor, where all the information it contains will be used to define + the table structure. + + .. rubric:: IsDescription attributes + + .. attribute:: _v_pos + + Sets the position of a possible nested column description among its + sibling columns. This attribute can be specified *when declaring* + an IsDescription subclass to complement its *metadata*. + + .. attribute:: columns + + Maps the name of each column in the description to its own descriptive + object. This attribute is *automatically created* when an IsDescription + subclass is declared. 
Please note that declared columns can no longer + be accessed as normal class variables after its creation. + + """ + + +def descr_from_dtype(dtype_, ptparams=None): + """Get a description instance and byteorder from a (nested) NumPy dtype.""" + + fields = {} + fbyteorder = '|' + for name in dtype_.names: + dtype, offset = dtype_.fields[name][:2] + kind = dtype.base.kind + byteorder = dtype.base.byteorder + if byteorder in '><=': + if fbyteorder not in ['|', byteorder]: + raise NotImplementedError( + "structured arrays with mixed byteorders " + "are not supported yet, sorry") + fbyteorder = byteorder + # Non-nested column + if kind in 'biufSUc': + col = Col.from_dtype(dtype, pos=offset, _offset=offset) + # Nested column + elif kind == 'V' and dtype.shape in [(), (1,)]: + if dtype.shape != (): + warnings.warn( + "nested descriptions will be converted to scalar") + col, _ = descr_from_dtype(dtype.base, ptparams=ptparams) + col._v_pos = offset + col._v_offset = offset + else: + raise NotImplementedError( + "structured arrays with columns with type description ``%s`` " + "are not supported yet, sorry" % dtype) + fields[name] = col + + return Description(fields, ptparams=ptparams), fbyteorder + + +def dtype_from_descr(descr, byteorder=None, ptparams=None): + """Get a (nested) NumPy dtype from a description instance and byteorder. + + The descr parameter can be a Description or IsDescription + instance, sub-class of IsDescription or a dictionary. + + """ + + if isinstance(descr, dict): + descr = Description(descr, ptparams=ptparams) + elif (type(descr) == type(IsDescription) + and issubclass(descr, IsDescription)): + descr = Description(descr().columns, ptparams=ptparams) + elif isinstance(descr, IsDescription): + descr = Description(descr.columns, ptparams=ptparams) + elif not isinstance(descr, Description): + raise ValueError('invalid description: %r' % descr) + + dtype_ = descr._v_dtype + + if byteorder and byteorder != '|': + dtype_ = dtype_.newbyteorder(byteorder) + + return dtype_ + + +if __name__ == "__main__": + """Test code.""" + + class Info(IsDescription): + _v_pos = 2 + Name = UInt32Col() + Value = Float64Col() + + class Test(IsDescription): + """A description that has several columns.""" + + x = Col.from_type("int32", 2, 0, pos=0) + y = Col.from_kind('float', dflt=1, shape=(2, 3)) + z = UInt8Col(dflt=1) + color = StringCol(2, dflt=" ") + # color = UInt32Col(2) + Info = Info() + + class info(IsDescription): + _v_pos = 1 + name = UInt32Col() + value = Float64Col(pos=0) + y2 = Col.from_kind('float', dflt=1, shape=(2, 3), pos=1) + z2 = UInt8Col(dflt=1) + + class info2(IsDescription): + y3 = Col.from_kind('float', dflt=1, shape=(2, 3)) + z3 = UInt8Col(dflt=1) + name = UInt32Col() + value = Float64Col() + + class info3(IsDescription): + name = UInt32Col() + value = Float64Col() + y4 = Col.from_kind('float', dflt=1, shape=(2, 3)) + z4 = UInt8Col(dflt=1) + +# class Info(IsDescription): +# _v_pos = 2 +# Name = StringCol(itemsize=2) +# Value = ComplexCol(itemsize=16) + +# class Test(IsDescription): +# """A description that has several columns""" +# x = Col.from_type("int32", 2, 0, pos=0) +# y = Col.from_kind('float', dflt=1, shape=(2,3)) +# z = UInt8Col(dflt=1) +# color = StringCol(2, dflt=" ") +# Info = Info() +# class info(IsDescription): +# _v_pos = 1 +# name = StringCol(itemsize=2) +# value = ComplexCol(itemsize=16, pos=0) +# y2 = Col.from_kind('float', dflt=1, shape=(2,3), pos=1) +# z2 = UInt8Col(dflt=1) +# class info2(IsDescription): +# y3 = Col.from_kind('float', dflt=1, shape=(2,3)) 
+# z3 = UInt8Col(dflt=1) +# name = StringCol(itemsize=2) +# value = ComplexCol(itemsize=16) +# class info3(IsDescription): +# name = StringCol(itemsize=2) +# value = ComplexCol(itemsize=16) +# y4 = Col.from_kind('float', dflt=1, shape=(2,3)) +# z4 = UInt8Col(dflt=1) + + # example cases of class Test + klass = Test() + # klass = Info() + desc = Description(klass.columns) + print("Description representation (short) ==>", desc) + print("Description representation (long) ==>", repr(desc)) + print("Column names ==>", desc._v_names) + print("Column x ==>", desc.x) + print("Column Info ==>", desc.Info) + print("Column Info.value ==>", desc.Info.Value) + print("Nested column names ==>", desc._v_nested_names) + print("Defaults ==>", desc._v_dflts) + print("Nested Formats ==>", desc._v_nested_formats) + print("Nested Descriptions ==>", desc._v_nested_descr) + print("Nested Descriptions (info) ==>", desc.info._v_nested_descr) + print("Total size ==>", desc._v_dtype.itemsize) + + # check _f_walk + for object in desc._f_walk(): + if isinstance(object, Description): + print("******begin object*************", end=' ') + print("name -->", object._v_name) + # print("name -->", object._v_dtype.name) + # print("object childs-->", object._v_names) + # print("object nested childs-->", object._v_nested_names) + print("totalsize-->", object._v_dtype.itemsize) + else: + # pass + print("leaf -->", object._v_name, object.dtype) + + class testDescParent(IsDescription): + c = Int32Col() + + class testDesc(testDescParent): + pass + + assert 'c' in testDesc.columns diff --git a/tables/earray.py b/tables/earray.py new file mode 100644 index 0000000..c8f8e45 --- /dev/null +++ b/tables/earray.py @@ -0,0 +1,241 @@ +"""Here is defined the EArray class.""" + +import numpy as np + +from .utils import convert_to_np_atom2, SizeType +from .carray import CArray + + +# default version for EARRAY objects +# obversion = "1.0" # initial version +# obversion = "1.1" # support for complex datatypes +# obversion = "1.2" # This adds support for time datatypes. +# obversion = "1.3" # This adds support for enumerated datatypes. +obversion = "1.4" # Numeric and numarray flavors are gone. + + +class EArray(CArray): + """This class represents extendable, homogeneous datasets in an HDF5 file. + + The main difference between an EArray and a CArray (see + :ref:`CArrayClassDescr`), from which it inherits, is that the former + can be enlarged along one of its dimensions, the *enlargeable + dimension*. That means that the :attr:`Leaf.extdim` attribute (see + :class:`Leaf`) of any EArray instance will always be non-negative. + Multiple enlargeable dimensions might be supported in the future. + + New rows can be added to the end of an enlargeable array by using the + :meth:`EArray.append` method. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode*. + + name : str + The name of this node in its parent group. + + atom + An `Atom` instance representing the *type* and *shape* + of the atomic objects to be saved. + + shape + The shape of the new array. One (and only one) of + the shape dimensions *must* be 0. The dimension being 0 + means that the resulting `EArray` object can be extended + along it. Multiple enlargeable dimensions are not supported + right now. + + title + A description for this node (it sets the ``TITLE`` + HDF5 attribute on disk). 
+ + filters + An instance of the `Filters` class that provides information + about the desired I/O filters to be applied during the life + of this object. + + expectedrows + A user estimate about the number of row elements that will + be added to the growable dimension in the `EArray` node. + If not provided, the default value is ``EXPECTED_ROWS_EARRAY`` + (see ``tables/parameters.py``). If you plan to create either + a much smaller or a much bigger `EArray` try providing a guess; + this will optimize the HDF5 B-Tree creation and management + process time and the amount of memory used. + + chunkshape + The shape of the data chunk to be read or written in a single + HDF5 I/O operation. Filters are applied to those chunks of data. + The dimensionality of `chunkshape` must be the same as that of + `shape` (beware: no dimension should be 0 this time!). + If ``None``, a sensible value is calculated based on the + `expectedrows` parameter (which is recommended). + + byteorder + The byteorder of the data *on disk*, specified as 'little' or + 'big'. If this is not specified, the byteorder is that of the + platform. + + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, object + birth time); default True. Semantics of these times depend on their + implementation in the HDF5 library: refer to documentation of the + H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata + change time) is implemented. + + .. versionadded:: 3.4.3 + + Examples + -------- + + See below a small example of the use of the `EArray` class. The + code is available in ``examples/earray1.py``:: + + import numpy as np + import tables as tb + + fileh = tb.open_file('earray1.h5', mode='w') + a = tb.StringAtom(itemsize=8) + + # Use ``a`` as the object type for the enlargeable array. + array_c = fileh.create_earray(fileh.root, 'array_c', a, (0,), + \"Chars\") + array_c.append(np.array(['a'*2, 'b'*4], dtype='S8')) + array_c.append(np.array(['a'*6, 'b'*8, 'c'*10], dtype='S8')) + + # Read the string ``EArray`` we have created on disk. + for s in array_c: + print('array_c[%s] => %r' % (array_c.nrow, s)) + # Close the file. + fileh.close() + + The output for the previous script is something like:: + + array_c[0] => 'aa' + array_c[1] => 'bbbb' + array_c[2] => 'aaaaaa' + array_c[3] => 'bbbbbbbb' + array_c[4] => 'cccccccc' + + """ + + # Class identifier. 
+ _c_classid = 'EARRAY' + + def __init__(self, parentnode, name, + atom=None, shape=None, title="", + filters=None, expectedrows=None, + chunkshape=None, byteorder=None, + _log=True, track_times=True): + + # Specific of EArray + if expectedrows is None: + expectedrows = parentnode._v_file.params['EXPECTED_ROWS_EARRAY'] + self._v_expectedrows = expectedrows + """The expected number of rows to be stored in the array.""" + + # Call the parent (CArray) init code + super().__init__(parentnode, name, atom, shape, title, filters, + chunkshape, byteorder, _log, track_times) + + def _g_create(self): + """Create a new array in file (specific part).""" + + # Pre-conditions and extdim computation + zerodims = np.sum(np.array(self.shape) == 0) + if zerodims > 0: + if zerodims == 1: + self.extdim = list(self.shape).index(0) + else: + raise NotImplementedError( + "Multiple enlargeable (0-)dimensions are not " + "supported.") + else: + raise ValueError( + "When creating EArrays, you need to set one of " + "the dimensions of the Atom instance to zero.") + + # Finish the common part of the creation process + return self._g_create_common(self._v_expectedrows) + + def _check_shape_append(self, nparr): + """Test that nparr shape is consistent with underlying EArray.""" + + # The arrays conforms self expandibility? + myrank = len(self.shape) + narank = len(nparr.shape) - len(self.atom.shape) + if myrank != narank: + raise ValueError(("the ranks of the appended object (%d) and the " + "``%s`` EArray (%d) differ") + % (narank, self._v_pathname, myrank)) + for i in range(myrank): + if i != self.extdim and self.shape[i] != nparr.shape[i]: + raise ValueError(("the shapes of the appended object and the " + "``%s`` EArray differ in non-enlargeable " + "dimension %d") % (self._v_pathname, i)) + + def append(self, sequence): + """Add a sequence of data to the end of the dataset. + + The sequence must have the same type as the array; otherwise a + TypeError is raised. In the same way, the dimensions of the + sequence must conform to the shape of the array, that is, all + dimensions must match, with the exception of the enlargeable + dimension, which can be of any length (even 0!). If the shape + of the sequence is invalid, a ValueError is raised. 
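+
+        A short sketch (assuming ``array_c`` is the one-dimensional string
+        ``EArray`` built in the class-level example above)::
+
+            array_c.append(np.array(['dd', 'ee'], dtype='S8'))  # two new rows
+            array_c.append(np.empty((0,), dtype='S8'))          # zero rows: allowed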
+ + """ + + self._g_check_open() + self._v_file._check_writable() + + # Convert the sequence into a NumPy object + nparr = convert_to_np_atom2(sequence, self.atom) + # Check if it has a consistent shape with underlying EArray + self._check_shape_append(nparr) + # If the size of the nparr is zero, don't do anything else + if nparr.size > 0: + self._append(nparr) + + def _g_copy_with_stats(self, group, name, start, stop, step, + title, filters, chunkshape, _log, **kwargs): + """Private part of Leaf.copy() for each kind of leaf.""" + + (start, stop, step) = self._process_range_read(start, stop, step) + # Build the new EArray object + maindim = self.maindim + shape = list(self.shape) + shape[maindim] = 0 + # The number of final rows + nrows = len(range(start, stop, step)) + # Build the new EArray object + object = EArray( + group, name, atom=self.atom, shape=shape, title=title, + filters=filters, expectedrows=nrows, chunkshape=chunkshape, + _log=_log) + # Now, fill the new earray with values from source + nrowsinbuf = self.nrowsinbuf + # The slices parameter for self.__getitem__ + slices = [slice(0, dim, 1) for dim in self.shape] + # This is a hack to prevent doing unnecessary conversions + # when copying buffers + self._v_convert = False + # Start the copy itself + for start2 in range(start, stop, step * nrowsinbuf): + # Save the records on disk + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Set the proper slice in the extensible dimension + slices[maindim] = slice(start2, stop2, step) + object._append(self.__getitem__(tuple(slices))) + # Active the conversion again (default) + self._v_convert = True + nbytes = np.prod(self.shape, dtype=SizeType) * self.atom.itemsize + + return (object, nbytes) diff --git a/tables/exceptions.py b/tables/exceptions.py new file mode 100644 index 0000000..5b3eb5a --- /dev/null +++ b/tables/exceptions.py @@ -0,0 +1,366 @@ +"""Declare exceptions and warnings that are specific to PyTables.""" + +import os +import warnings +import traceback + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +class HDF5ExtError(RuntimeError): + """A low level HDF5 operation failed. + + This exception is raised the low level PyTables components used for + accessing HDF5 files. It usually signals that something is not + going well in the HDF5 library or even at the Input/Output level. + + Errors in the HDF5 C library may be accompanied by an extensive + HDF5 back trace on standard error (see also + :func:`tables.silence_hdf5_messages`). + + .. versionchanged:: 2.4 + + Parameters + ---------- + message + error message + h5bt + This parameter (keyword only) controls the HDF5 back trace + handling. Any keyword arguments other than h5bt is ignored. + + * if set to False the HDF5 back trace is ignored and the + :attr:`HDF5ExtError.h5backtrace` attribute is set to None + * if set to True the back trace is retrieved from the HDF5 + library and stored in the :attr:`HDF5ExtError.h5backtrace` + attribute as a list of tuples + * if set to "VERBOSE" (default) the HDF5 back trace is + stored in the :attr:`HDF5ExtError.h5backtrace` attribute + and also included in the string representation of the + exception + * if not set (or set to None) the default policy is used + (see :attr:`HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY`) + + """ + + # NOTE: in order to avoid circular dependencies between modules the + # _dump_h5_backtrace method is set at initialization time in + # the utilsExtenion. 
+ _dump_h5_backtrace = None + + DEFAULT_H5_BACKTRACE_POLICY = "VERBOSE" + """Default policy for HDF5 backtrace handling + + * if set to False the HDF5 back trace is ignored and the + :attr:`HDF5ExtError.h5backtrace` attribute is set to None + * if set to True the back trace is retrieved from the HDF5 + library and stored in the :attr:`HDF5ExtError.h5backtrace` + attribute as a list of tuples + * if set to "VERBOSE" (default) the HDF5 back trace is + stored in the :attr:`HDF5ExtError.h5backtrace` attribute + and also included in the string representation of the + exception + + This parameter can be set using the + :envvar:`PT_DEFAULT_H5_BACKTRACE_POLICY` environment variable. + Allowed values are "IGNORE" (or "FALSE"), "SAVE" (or "TRUE") and + "VERBOSE" to set the policy to False, True and "VERBOSE" + respectively. The special value "DEFAULT" can be used to reset + the policy to the default value + + .. versionadded:: 2.4 + """ + + @classmethod + def set_policy_from_env(cls): + envmap = { + "IGNORE": False, + "FALSE": False, + "SAVE": True, + "TRUE": True, + "VERBOSE": "VERBOSE", + "DEFAULT": "VERBOSE", + } + oldvalue = cls.DEFAULT_H5_BACKTRACE_POLICY + envvalue = os.environ.get("PT_DEFAULT_H5_BACKTRACE_POLICY", "DEFAULT") + try: + newvalue = envmap[envvalue.upper()] + except KeyError: + warnings.warn("Invalid value for the environment variable " + "'PT_DEFAULT_H5_BACKTRACE_POLICY'. The default " + "policy for HDF5 back trace management in PyTables " + "will be: '%s'" % oldvalue) + else: + cls.DEFAULT_H5_BACKTRACE_POLICY = newvalue + + return oldvalue + + def __init__(self, *args, **kargs): + + super().__init__(*args) + + self._h5bt_policy = kargs.get('h5bt', self.DEFAULT_H5_BACKTRACE_POLICY) + + if self._h5bt_policy and self._dump_h5_backtrace is not None: + self.h5backtrace = self._dump_h5_backtrace() + """HDF5 back trace. + + Contains the HDF5 back trace as a (possibly empty) list of + tuples. Each tuple has the following format:: + + (filename, line number, function name, text) + + Depending on the value of the *h5bt* parameter passed to the + initializer the h5backtrace attribute can be set to None. + This means that the HDF5 back trace has been simply ignored + (not retrieved from the HDF5 C library error stack) or that + there has been an error (silently ignored) during the HDF5 back + trace retrieval. + + .. versionadded:: 2.4 + + See Also + -------- + traceback.format_list : :func:`traceback.format_list` + + """ + + # XXX: check _dump_h5_backtrace failures + else: + self.h5backtrace = None + + def __str__(self): + """Returns a sting representation of the exception. + + The actual result depends on policy set in the initializer + :meth:`HDF5ExtError.__init__`. + + .. versionadded:: 2.4 + + """ + + verbose = bool(self._h5bt_policy in ('VERBOSE', 'verbose')) + + if verbose and self.h5backtrace: + bt = "\n".join([ + "HDF5 error back trace\n", + self.format_h5_backtrace(), + "End of HDF5 error back trace" + ]) + + if len(self.args) == 1 and isinstance(self.args[0], str): + msg = super().__str__() + msg = f"{bt}\n\n{msg}" + elif self.h5backtrace[-1][-1]: + msg = f"{bt}\n\n{self.h5backtrace[-1][-1]}" + else: + msg = bt + else: + msg = super().__str__() + + return msg + + def format_h5_backtrace(self, backtrace=None): + """Convert the HDF5 trace back represented as a list of tuples. + (see :attr:`HDF5ExtError.h5backtrace`) into a string. + + .. 
versionadded:: 2.4 + + """ + if backtrace is None: + backtrace = self.h5backtrace + + if backtrace is None: + return 'No HDF5 back trace available' + else: + return ''.join(traceback.format_list(backtrace)) + + +# Initialize the policy for HDF5 back trace handling +HDF5ExtError.set_policy_from_env() + + +# The following exceptions are concretions of the ``ValueError`` exceptions +# raised by ``file`` objects on certain operations. + +class ClosedNodeError(ValueError): + """The operation can not be completed because the node is closed. + + For instance, listing the children of a closed group is not allowed. + + """ + + pass + + +class ClosedFileError(ValueError): + """The operation can not be completed because the hosting file is closed. + + For instance, getting an existing node from a closed file is not + allowed. + + """ + + pass + + +class FileModeError(ValueError): + """The operation can not be carried out because the mode in which the + hosting file is opened is not adequate. + + For instance, removing an existing leaf from a read-only file is not + allowed. + + """ + + pass + + +class NodeError(AttributeError, LookupError): + """Invalid hierarchy manipulation operation requested. + + This exception is raised when the user requests an operation on the + hierarchy which can not be run because of the current layout of the + tree. This includes accessing nonexistent nodes, moving or copying + or creating over an existing node, non-recursively removing groups + with children, and other similarly invalid operations. + + A node in a PyTables database cannot be simply overwritten by + replacing it. Instead, the old node must be removed explicitely + before another one can take its place. This is done to protect + interactive users from inadvertedly deleting whole trees of data by + a single erroneous command. + + """ + + pass + + +class NoSuchNodeError(NodeError): + """An operation was requested on a node that does not exist. + + This exception is raised when an operation gets a path name or a + ``(where, name)`` pair leading to a nonexistent node. + + """ + + pass + + +class UndoRedoError(Exception): + """Problems with doing/redoing actions with Undo/Redo feature. + + This exception indicates a problem related to the Undo/Redo + mechanism, such as trying to undo or redo actions with this + mechanism disabled, or going to a nonexistent mark. + + """ + + pass + + +class UndoRedoWarning(Warning): + """Issued when an action not supporting Undo/Redo is run. + + This warning is only shown when the Undo/Redo mechanism is enabled. + + """ + + pass + + +class NaturalNameWarning(Warning): + """Issued when a non-pythonic name is given for a node. + + This is not an error and may even be very useful in certain + contexts, but one should be aware that such nodes cannot be + accessed using natural naming (instead, ``getattr()`` must be + used explicitly). + """ + + pass + + +class PerformanceWarning(Warning): + """Warning for operations which may cause a performance drop. + + This warning is issued when an operation is made on the database + which may cause it to slow down on future operations (i.e. making + the node tree grow too much). + + """ + + pass + + +class FlavorError(ValueError): + """Unsupported or unavailable flavor or flavor conversion. + + This exception is raised when an unsupported or unavailable flavor + is given to a dataset, or when a conversion of data between two + given flavors is not supported nor available. 
+ + """ + + pass + + +class FlavorWarning(Warning): + """Unsupported or unavailable flavor conversion. + + This warning is issued when a conversion of data between two given + flavors is not supported nor available, and raising an error would + render the data inaccessible (e.g. on a dataset of an unavailable + flavor in a read-only file). + + See the `FlavorError` class for more information. + + """ + + pass + + +class FiltersWarning(Warning): + """Unavailable filters. + + This warning is issued when a valid filter is specified but it is + not available in the system. It may mean that an available default + filter is to be used instead. + + """ + + pass + + +class OldIndexWarning(Warning): + """Unsupported index format. + + This warning is issued when an index in an unsupported format is + found. The index will be marked as invalid and will behave as if + doesn't exist. + + """ + + pass + + +class DataTypeWarning(Warning): + """Unsupported data type. + + This warning is issued when an unsupported HDF5 data type is found + (normally in a file created with other tool than PyTables). + + """ + + pass + + +class ExperimentalFeatureWarning(Warning): + """Generic warning for experimental features. + + This warning is issued when using a functionality that is still + experimental and that users have to use with care. + + """ + pass diff --git a/tables/expression.py b/tables/expression.py new file mode 100644 index 0000000..41a2c86 --- /dev/null +++ b/tables/expression.py @@ -0,0 +1,703 @@ +"""Here is defined the Expr class.""" + +import sys +import warnings + +import numexpr as ne +import numpy as np +import tables as tb + +from .exceptions import PerformanceWarning +from .parameters import IO_BUFFER_SIZE, BUFFER_TIMES + + +class Expr: + """A class for evaluating expressions with arbitrary array-like objects. + + Expr is a class for evaluating expressions containing array-like objects. + With it, you can evaluate expressions (like "3 * a + 4 * b") that + operate on arbitrary large arrays while optimizing the resources + required to perform them (basically main memory and CPU cache memory). + It is similar to the Numexpr package (see :ref:`[NUMEXPR] `), + but in addition to NumPy objects, it also accepts disk-based homogeneous + arrays, like the Array, CArray, EArray and Column PyTables objects. + + .. warning:: + + Expr class only offers a subset of the Numexpr features due to the + complexity of implement some of them when dealing with huge amount of + data. + + All the internal computations are performed via the Numexpr package, + so all the broadcast and upcasting rules of Numexpr applies here too. + These rules are very similar to the NumPy ones, but with some exceptions + due to the particularities of having to deal with potentially very large + disk-based arrays. Be sure to read the documentation of the Expr + constructor and methods as well as that of Numexpr, if you want to fully + grasp these particularities. + + + Parameters + ---------- + expr : str + This specifies the expression to be evaluated, such as "2 * a + 3 * b". + uservars : dict + This can be used to define the variable names appearing in *expr*. + This mapping should consist of identifier-like strings pointing to any + `Array`, `CArray`, `EArray`, `Column` or NumPy ndarray instances (or + even others which will tried to be converted to ndarrays). When + `uservars` is not provided or `None`, the current local and global + namespace is sought instead of `uservars`. 
It is also possible to pass + just some of the variables in expression via the `uservars` mapping, + and the rest will be retrieved from the current local and global + namespaces. + kwargs : dict + This is meant to pass additional parameters to the Numexpr kernel. + This is basically the same as the kwargs argument in + Numexpr.evaluate(), and is mainly meant for advanced use. + + Examples + -------- + The following shows an example of using Expr:: + + >>> f = tb.open_file('/tmp/test_expr.h5', 'w') + >>> a = f.create_array('/', 'a', np.array([1,2,3])) + >>> b = f.create_array('/', 'b', np.array([3,4,5])) + >>> c = np.array([4,5,6]) + >>> expr = tb.Expr("2 * a + b * c") # initialize the expression + >>> expr.eval() # evaluate it + array([14, 24, 36], dtype=int64) + >>> sum(expr) # use as an iterator + 74 + + where you can see that you can mix different containers in + the expression (whenever shapes are consistent). + + You can also work with multidimensional arrays:: + + >>> a2 = f.create_array('/', 'a2', np.array([[1,2],[3,4]])) + >>> b2 = f.create_array('/', 'b2', np.array([[3,4],[5,6]])) + >>> c2 = np.array([4,5]) # This will be broadcasted + >>> expr = tb.Expr("2 * a2 + b2-c2") + >>> expr.eval() + array([[1, 3], + [7, 9]], dtype=int64) + >>> sum(expr) + array([ 8, 12], dtype=int64) + >>> f.close() + + .. rubric:: Expr attributes + + .. attribute:: append_mode + + The append mode for user-provided output containers. + + .. attribute:: maindim + + Common main dimension for inputs in expression. + + .. attribute:: names + + The names of variables in expression (list). + + .. attribute:: out + + The user-provided container (if any) for the expression outcome. + + .. attribute:: o_start + + The start range selection for the user-provided output. + + .. attribute:: o_stop + + The stop range selection for the user-provided output. + + .. attribute:: o_step + + The step range selection for the user-provided output. + + .. attribute:: shape + + Common shape for the arrays in expression. + + .. attribute:: values + + The values of variables in expression (list). + + """ + + _exprvars_cache = {} + """Cache of variables participating in expressions. + + .. 
versionadded:: 3.0 + + """ + + def __init__(self, expr, uservars=None, **kwargs): + + self.append_mode = False + """The append mode for user-provided output containers.""" + self.maindim = 0 + """Common main dimension for inputs in expression.""" + self.names = [] + """The names of variables in expression (list).""" + self.out = None + """The user-provided container (if any) for the expression outcome.""" + self.o_start = None + """The start range selection for the user-provided output.""" + self.o_stop = None + """The stop range selection for the user-provided output.""" + self.o_step = None + """The step range selection for the user-provided output.""" + self.shape = None + """Common shape for the arrays in expression.""" + self.start, self.stop, self.step = (None,) * 3 + self.start = None + """The start range selection for the input.""" + self.stop = None + """The stop range selection for the input.""" + self.step = None + """The step range selection for the input.""" + self.values = [] + """The values of variables in expression (list).""" + + self._compiled_expr = None + """The compiled expression.""" + self._single_row_out = None + """A sample of the output with just a single row.""" + + # First, get the signature for the arrays in expression + vars_ = self._required_expr_vars(expr, uservars) + context = ne.necompiler.getContext(kwargs) + self.names, _ = ne.necompiler.getExprNames(expr, context) + + # Raise a ValueError in case we have unsupported objects + for name, var in vars_.items(): + if type(var) in (int, float, str): + continue + if not isinstance(var, (tb.Leaf, tb.Column)): + if hasattr(var, "dtype"): + # Quacks like a NumPy object + continue + raise TypeError("Unsupported variable type: %r" % var) + objname = var.__class__.__name__ + if objname not in ("Array", "CArray", "EArray", "Column"): + raise TypeError("Unsupported variable type: %r" % var) + + # NumPy arrays to be copied? (we don't need to worry about + # PyTables objects, as the reads always return contiguous and + # aligned objects, or at least I think so). + for name, var in vars_.items(): + if isinstance(var, np.ndarray): + # See numexpr.necompiler.evaluate for a rational + # of the code below + if not var.flags.aligned: + if var.ndim != 1: + # Do a copy of this variable + var = var.copy() + # Update the vars_ dictionary + vars_[name] = var + + # Get the variables and types + values = self.values + types_ = [] + for name in self.names: + value = vars_[name] + if hasattr(value, 'atom'): + types_.append(value.atom) + elif hasattr(value, 'dtype'): + types_.append(value) + else: + # try to convert into a NumPy array + value = np.array(value) + types_.append(value) + values.append(value) + + # Create a signature for the expression + signature = [(name, ne.necompiler.getType(type_)) + for (name, type_) in zip(self.names, types_)] + + # Compile the expression + self._compiled_expr = ne.necompiler.NumExpr(expr, signature, **kwargs) + + # Guess the shape for the outcome and the maindim of inputs + self.shape, self.maindim = self._guess_shape() + + # The next method is similar to their counterpart in `Table`, but + # adapted to the `Expr` own requirements. + def _required_expr_vars(self, expression, uservars, depth=2): + """Get the variables required by the `expression`. + + A new dictionary defining the variables used in the `expression` + is returned. Required variables are first looked up in the + `uservars` mapping, then in the set of top-level columns of the + table. Unknown variables cause a `NameError` to be raised. 
+ + When `uservars` is `None`, the local and global namespace where + the API callable which uses this method is called is sought + instead. To disable this mechanism, just specify a mapping as + `uservars`. + + Nested columns and variables with an ``uint64`` type are not + allowed (`TypeError` and `NotImplementedError` are raised, + respectively). + + `depth` specifies the depth of the frame in order to reach local + or global variables. + + """ + + # Get the names of variables used in the expression. + exprvars_cache = self._exprvars_cache + if expression not in exprvars_cache: + # Protection against growing the cache too much + if len(exprvars_cache) > 256: + # Remove 10 (arbitrary) elements from the cache + for k in list(exprvars_cache)[:10]: + del exprvars_cache[k] + cexpr = compile(expression, '', 'eval') + exprvars = [var for var in cexpr.co_names + if var not in ['None', 'False', 'True'] + and var not in ne.expressions.functions] + exprvars_cache[expression] = exprvars + else: + exprvars = exprvars_cache[expression] + + # Get the local and global variable mappings of the user frame + # if no mapping has been explicitly given for user variables. + user_locals, user_globals = {}, {} + if uservars is None: + user_frame = sys._getframe(depth) + user_locals = user_frame.f_locals + user_globals = user_frame.f_globals + + # Look for the required variables first among the ones + # explicitly provided by the user. + reqvars = {} + for var in exprvars: + # Get the value. + if uservars is not None and var in uservars: + val = uservars[var] + elif uservars is None and var in user_locals: + val = user_locals[var] + elif uservars is None and var in user_globals: + val = user_globals[var] + else: + raise NameError("name ``%s`` is not defined" % var) + + # Check the value. + if hasattr(val, 'dtype') and val.dtype.str[1:] == 'u8': + raise NotImplementedError( + "variable ``%s`` refers to " + "a 64-bit unsigned integer object, that is " + "not yet supported in expressions, sorry; " % var) + elif hasattr(val, '_v_colpathnames'): # nested column + # This branch is never reached because the compile step + # above already raise a ``TypeError`` for nested + # columns, but that could change in the future. So it + # is best to let this here. + raise TypeError( + "variable ``%s`` refers to a nested column, " + "not allowed in expressions" % var) + reqvars[var] = val + return reqvars + + def set_inputs_range(self, start=None, stop=None, step=None): + """Define a range for all inputs in expression. + + The computation will only take place for the range defined by + the start, stop and step parameters in the main dimension of + inputs (or the leading one, if the object lacks the concept of + main dimension, like a NumPy container). If not a common main + dimension exists for all inputs, the leading dimension will be + used instead. + + """ + + self.start = start + self.stop = stop + self.step = step + + def set_output(self, out, append_mode=False): + """Set out as container for output as well as the append_mode. + + The out must be a container that is meant to keep the outcome of + the expression. It should be an homogeneous type container and + can typically be an Array, CArray, EArray, Column or a NumPy ndarray. + + The append_mode specifies the way of which the output is filled. + If true, the rows of the outcome are *appended* to the out container. + Of course, for doing this it is necessary that out would have an + append() method (like an EArray, for example). 
+ + If append_mode is false, the output is set via the __setitem__() + method (see the Expr.set_output_range() for info on how to select + the rows to be updated). If out is smaller than what is required + by the expression, only the computations that are needed to fill + up the container are carried out. If it is larger, the excess + elements are unaffected. + + """ + + if not (hasattr(out, "shape") and hasattr(out, "__setitem__")): + raise ValueError( + "You need to pass a settable multidimensional container " + "as output") + self.out = out + if append_mode and not hasattr(out, "append"): + raise ValueError( + "For activating the ``append`` mode, you need a container " + "with an `append()` method (like the `EArray`)") + self.append_mode = append_mode + + def set_output_range(self, start=None, stop=None, step=None): + """Define a range for user-provided output object. + + The output object will only be modified in the range specified by the + start, stop and step parameters in the main dimension of output (or the + leading one, if the object does not have the concept of main dimension, + like a NumPy container). + + """ + + if self.out is None: + raise IndexError( + "You need to pass an output object to `setOut()` first") + self.o_start = start + self.o_stop = stop + self.o_step = step + + # Although the next code is similar to the method in `Leaf`, it + # allows the use of pure NumPy objects. + def _calc_nrowsinbuf(self, object_): + """Calculate the number of rows that will fit in a buffer.""" + + # Compute the rowsize for the *leading* dimension + shape_ = list(object_.shape) + if shape_: + shape_[0] = 1 + + rowsize = np.prod(shape_) * object_.dtype.itemsize + + # Compute the nrowsinbuf + # Multiplying the I/O buffer size by 4 gives optimal results + # in my benchmarks with `tables.Expr` (see ``bench/poly.py``) + buffersize = IO_BUFFER_SIZE * 4 + nrowsinbuf = buffersize // rowsize + + # Safeguard against row sizes being extremely large + if nrowsinbuf == 0: + nrowsinbuf = 1 + # If rowsize is too large, issue a Performance warning + maxrowsize = BUFFER_TIMES * buffersize + if rowsize > maxrowsize: + warnings.warn("""\ +The object ``%s`` is exceeding the maximum recommended rowsize (%d +bytes); be ready to see PyTables asking for *lots* of memory and +possibly slow I/O. 
You may want to reduce the rowsize by trimming the +value of dimensions that are orthogonal (and preferably close) to the +*leading* dimension of this object.""" + % (object, maxrowsize), + PerformanceWarning) + + return nrowsinbuf + + def _guess_shape(self): + """Guess the shape of the output of the expression.""" + + # First, compute the maximum dimension of inputs and maindim + # (if it exists) + maxndim = 0 + maindims = [] + for val in self.values: + # Get the minimum of the lengths + if len(val.shape) > maxndim: + maxndim = len(val.shape) + if hasattr(val, "maindim"): + maindims.append(val.maindim) + if maxndim == 0: + self._single_row_out = out = self._compiled_expr(*self.values) + return (), None + if maindims and [maindims[0]] * len(maindims) == maindims: + # If all maindims detected are the same, use this as maindim + maindim = maindims[0] + else: + # If not, the main dimension will be the default one + maindim = 0 + + # The slices parameter for inputs + slices = (slice(None),) * maindim + (0,) + + # Now, collect the values in first row of arrays with maximum dims + vals = [] + lens = [] + for val in self.values: + shape = val.shape + # Warning: don't use len(val) below or it will raise an + # `Overflow` error on 32-bit platforms for large enough arrays. + if shape != () and shape[maindim] == 0: + vals.append(val[:]) + lens.append(0) + elif len(shape) < maxndim: + vals.append(val) + else: + vals.append(val.__getitem__(slices)) + lens.append(shape[maindim]) + minlen = min(lens) + self._single_row_out = out = self._compiled_expr(*vals) + shape = list(out.shape) + if minlen > 0: + shape.insert(maindim, minlen) + return shape, maindim + + def _get_info(self, shape, maindim, itermode=False): + """Return various info needed for evaluating the computation loop.""" + + # Compute the shape of the resulting container having + # in account new possible values of start, stop and step in + # the inputs range + if maindim is not None: + (start, stop, step) = slice( + self.start, self.stop, self.step).indices(shape[maindim]) + shape[maindim] = min( + shape[maindim], len(range(start, stop, step))) + i_nrows = shape[maindim] + else: + start, stop, step = 0, 0, None + i_nrows = 0 + + if not itermode: + # Create a container for output if not defined yet + o_maindim = 0 # Default maindim + if self.out is None: + out = np.empty(shape, dtype=self._single_row_out.dtype) + # Get the trivial values for start, stop and step + if maindim is not None: + (o_start, o_stop, o_step) = (0, shape[maindim], 1) + else: + (o_start, o_stop, o_step) = (0, 0, 1) + else: + out = self.out + # Out container already provided. Do some sanity checks. 
+ if hasattr(out, "maindim"): + o_maindim = out.maindim + + # Refine the shape of the resulting container having in + # account new possible values of start, stop and step in + # the output range + o_shape = list(out.shape) + s = slice(self.o_start, self.o_stop, self.o_step) + o_start, o_stop, o_step = s.indices(o_shape[o_maindim]) + o_shape[o_maindim] = min(o_shape[o_maindim], + len(range(o_start, o_stop, o_step))) + + # Check that the shape of output is consistent with inputs + tr_oshape = list(o_shape) # this implies a copy + olen_ = tr_oshape.pop(o_maindim) + tr_shape = list(shape) # do a copy + if maindim is not None: + len_ = tr_shape.pop(o_maindim) + else: + len_ = 1 + if tr_oshape != tr_shape: + raise ValueError( + "Shape for out container does not match expression") + # Force the input length to fit in `out` + if not self.append_mode and olen_ < len_: + shape[o_maindim] = olen_ + stop = start + olen_ + + # Get the positions of inputs that should be sliced (the others + # will be broadcasted) + ndim = len(shape) + slice_pos = [i for i, val in enumerate(self.values) + if len(val.shape) == ndim] + + # The size of the I/O buffer + nrowsinbuf = 1 + for i, val in enumerate(self.values): + # Skip scalar values in variables + if i in slice_pos: + nrows = self._calc_nrowsinbuf(val) + if nrows > nrowsinbuf: + nrowsinbuf = nrows + + if not itermode: + return (i_nrows, slice_pos, start, stop, step, nrowsinbuf, + out, o_maindim, o_start, o_stop, o_step) + else: + # For itermode, we don't need the out info + return (i_nrows, slice_pos, start, stop, step, nrowsinbuf) + + def eval(self): + """Evaluate the expression and return the outcome. + + Because of performance reasons, the computation order tries to go along + the common main dimension of all inputs. If not such a common main + dimension is found, the iteration will go along the leading dimension + instead. + + For non-consistent shapes in inputs (i.e. shapes having a different + number of dimensions), the regular NumPy broadcast rules applies. + There is one exception to this rule though: when the dimensions + orthogonal to the main dimension of the expression are consistent, but + the main dimension itself differs among the inputs, then the shortest + one is chosen for doing the computations. This is so because trying to + expand very large on-disk arrays could be too expensive or simply not + possible. + + Also, the regular Numexpr casting rules (which are similar to those of + NumPy, although you should check the Numexpr manual for the exceptions) + are applied to determine the output type. + + Finally, if the setOuput() method specifying a user container has + already been called, the output is sent to this user-provided + container. If not, a fresh NumPy container is returned instead. + + .. warning:: + + When dealing with large on-disk inputs, failing to specify an + on-disk container may consume all your available memory. 
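+
+        A minimal sketch of the on-disk pattern (assuming ``f`` is an open,
+        writable PyTables file and ``a`` and ``b`` are existing
+        one-dimensional float arrays of the same length)::
+
+            out = f.create_earray('/', 'out', tb.Float64Atom(), (0,))
+            expr = tb.Expr('a + 2 * b')
+            expr.set_output(out, append_mode=True)
+            expr.eval()    # rows are appended to the on-disk ``out`` array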
+ + """ + + values, shape, maindim = self.values, self.shape, self.maindim + + # Get different info we need for the main computation loop + (i_nrows, slice_pos, start, stop, step, nrowsinbuf, + out, o_maindim, o_start, o_stop, o_step) = \ + self._get_info(shape, maindim) + + if i_nrows == 0: + # No elements to compute + if start >= stop and self.start is not None: + return out + else: + return self._single_row_out + + # Create a key that selects every element in inputs and output + # (including the main dimension) + i_slices = [slice(None)] * (maindim + 1) + o_slices = [slice(None)] * (o_maindim + 1) + + # This is a hack to prevent doing unnecessary flavor conversions + # while reading buffers + for val in values: + if hasattr(val, 'maindim'): + val._v_convert = False + + # Start the computation itself + for start2 in range(start, stop, step * nrowsinbuf): + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Set the proper slice for inputs + i_slices[maindim] = slice(start2, stop2, step) + # Get the input values + vals = [] + for i, val in enumerate(values): + if i in slice_pos: + vals.append(val.__getitem__(tuple(i_slices))) + else: + # A read of values is not apparently needed, as PyTables + # leaves seems to work just fine inside Numexpr + vals.append(val) + # Do the actual computation for this slice + rout = self._compiled_expr(*vals) + # Set the values into the out buffer + if self.append_mode: + out.append(rout) + else: + # Compute the slice to be filled in output + start3 = o_start + (start2 - start) // step + stop3 = start3 + nrowsinbuf * o_step + if stop3 > o_stop: + stop3 = o_stop + o_slices[o_maindim] = slice(start3, stop3, o_step) + # Set the slice + out[tuple(o_slices)] = rout + + # Activate the conversion again (default) + for val in values: + if hasattr(val, 'maindim'): + val._v_convert = True + + return out + + def __iter__(self): + """Iterate over the rows of the outcome of the expression. + + This iterator always returns rows as NumPy objects, so a possible out + container specified in :meth:`Expr.set_output` method is ignored here. 
+ + """ + + values, shape, maindim = self.values, self.shape, self.maindim + + # Get different info we need for the main computation loop + (i_nrows, slice_pos, start, stop, step, nrowsinbuf) = \ + self._get_info(shape, maindim, itermode=True) + + if i_nrows == 0: + # No elements to compute + return + + # Create a key that selects every element in inputs + # (including the main dimension) + i_slices = [slice(None)] * (maindim + 1) + + # This is a hack to prevent doing unnecessary flavor conversions + # while reading buffers + for val in values: + if hasattr(val, 'maindim'): + val._v_convert = False + + # Start the computation itself + for start2 in range(start, stop, step * nrowsinbuf): + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Set the proper slice in the main dimension + i_slices[maindim] = slice(start2, stop2, step) + # Get the values for computing the buffer + vals = [] + for i, val in enumerate(values): + if i in slice_pos: + vals.append(val.__getitem__(tuple(i_slices))) + else: + # A read of values is not apparently needed, as PyTables + # leaves seems to work just fine inside Numexpr + vals.append(val) + # Do the actual computation + rout = self._compiled_expr(*vals) + # Return one row per call + yield from rout + + # Activate the conversion again (default) + for val in values: + if hasattr(val, 'maindim'): + val._v_convert = True + + +if __name__ == "__main__": + + # shape = (10000,10000) + shape = (10, 10_000) + + f = tb.open_file("/tmp/expression.h5", "w") + + # Create some arrays + a = f.create_carray(f.root, 'a', atom=tb.Float32Atom(dflt=1), shape=shape) + b = f.create_carray(f.root, 'b', atom=tb.Float32Atom(dflt=2), shape=shape) + c = f.create_carray(f.root, 'c', atom=tb.Float32Atom(dflt=3), shape=shape) + out = f.create_carray(f.root, 'out', atom=tb.Float32Atom(dflt=3), + shape=shape) + + expr = Expr("a * b + c") + expr.set_output(out) + d = expr.eval() + + print("returned-->", repr(d)) + # print(`d[:]`) + + f.close() diff --git a/tables/file.py b/tables/file.py new file mode 100644 index 0000000..0684f58 --- /dev/null +++ b/tables/file.py @@ -0,0 +1,2781 @@ +"""Create PyTables files and the object tree. + +This module support importing generic HDF5 files, on top of which +PyTables files are created, read or extended. If a file exists, an +object tree mirroring their hierarchical structure is created in memory. +File class offer methods to traverse the tree, as well as to create new +nodes. + +""" + +import atexit +import datetime +import os +import sys +import weakref +import warnings +from collections import defaultdict +from pathlib import Path + +import numexpr as ne +import numpy as np + +from . import hdf5extension +from . import utilsextension +from . import parameters +from .exceptions import (ClosedFileError, FileModeError, NodeError, + NoSuchNodeError, UndoRedoError, ClosedNodeError, + PerformanceWarning) +from .registry import get_class_by_name +from .path import join_path, split_path +from . import undoredo +from .description import (IsDescription, UInt8Col, StringCol, + descr_from_dtype, dtype_from_descr) +from .filters import Filters +from .node import Node, NotLoggedMixin +from .group import Group, RootGroup +from .group import TransactionGroupG, TransactionG, MarkG +from .leaf import Leaf +from .array import Array +from .carray import CArray +from .earray import EArray +from .vlarray import VLArray +from .table import Table +from . import linkextension +from .utils import detect_number_of_cores +from . 
import lrucacheextension +from .flavor import flavor_of, array_as_internal +from .atom import Atom + +from .link import SoftLink, ExternalLink + + +# format_version = "1.0" # Initial format +# format_version = "1.1" # Changes in ucl compression +# format_version = "1.2" # Support for enlargeable arrays and VLA's +# # 1.2 was introduced in PyTables 0.8 +# format_version = "1.3" # Support for indexes in Tables +# # 1.3 was introduced in PyTables 0.9 +# format_version = "1.4" # Support for multidimensional attributes +# # 1.4 was introduced in PyTables 1.1 +# format_version = "1.5" # Support for persistent defaults in tables +# # 1.5 was introduced in PyTables 1.2 +# format_version = "1.6" # Support for NumPy objects and new flavors for +# # objects. +# # 1.6 was introduced in pytables 1.3 +# format_version = "2.0" # Pickles are not used anymore in system attrs +# # 2.0 was introduced in PyTables 2.0 +format_version = "2.1" # Numeric and numarray flavors are gone. + +compatible_formats = [] # Old format versions we can read +# # Empty means that we support all the old formats + + +class _FileRegistry: + def __init__(self): + self._name_mapping = defaultdict(set) + self._handlers = set() + + @property + def filenames(self): + return list(self._name_mapping) + + @property + def handlers(self): + # return set(self._handlers) # return a copy + return self._handlers + + def __len__(self): + return len(self._handlers) + + def __contains__(self, filename): + return filename in self.filenames + + def add(self, handler): + self._name_mapping[handler.filename].add(handler) + self._handlers.add(handler) + + def remove(self, handler): + filename = handler.filename + self._name_mapping[filename].remove(handler) + # remove enpty keys + if not self._name_mapping[filename]: + del self._name_mapping[filename] + self._handlers.remove(handler) + + def get_handlers_by_name(self, filename): + # return set(self._name_mapping[filename]) # return a copy + return self._name_mapping[filename] + + def close_all(self): + are_open_files = len(self._handlers) > 0 + if are_open_files: + sys.stderr.write("Closing remaining open files:") + handlers = list(self._handlers) # make a copy + for fileh in handlers: + sys.stderr.write("%s..." % fileh.filename) + fileh.close() + sys.stderr.write("done") + if are_open_files: + sys.stderr.write("\n") + + +# Dict of opened files (keys are filenames and values filehandlers) +_open_files = _FileRegistry() + +# Opcodes for do-undo actions +_op_to_code = { + "MARK": 0, + "CREATE": 1, + "REMOVE": 2, + "MOVE": 3, + "ADDATTR": 4, + "DELATTR": 5, +} + +_code_to_op = ["MARK", "CREATE", "REMOVE", "MOVE", "ADDATTR", "DELATTR"] + + +# Paths and names for hidden nodes related with transactions. 
+_trans_version = '1.0' + +_trans_group_parent = '/' +_trans_group_name = '_p_transactions' +_trans_group_path = join_path(_trans_group_parent, _trans_group_name) + +_action_log_parent = _trans_group_path +_action_log_name = 'actionlog' +_action_log_path = join_path(_action_log_parent, _action_log_name) + +_trans_parent = _trans_group_path +_trans_name = 't%d' # %d -> transaction number +_trans_path = join_path(_trans_parent, _trans_name) + +_markParent = _trans_path +_markName = 'm%d' # %d -> mark number +_markPath = join_path(_markParent, _markName) + +_shadow_parent = _markPath +_shadow_name = 'a%d' # %d -> action number +_shadow_path = join_path(_shadow_parent, _shadow_name) + + +def _checkfilters(filters): + if not (filters is None or + isinstance(filters, Filters)): + raise TypeError("filter parameter has to be None or a Filter " + "instance and the passed type is: '%s'" % + type(filters)) + + +def copy_file(srcfilename, dstfilename, overwrite=False, **kwargs): + """An easy way of copying one PyTables file to another. + + This function allows you to copy an existing PyTables file named + srcfilename to another file called dstfilename. The source file + must exist and be readable. The destination file can be + overwritten in place if existing by asserting the overwrite + argument. + + This function is a shorthand for the :meth:`File.copy_file` method, + which acts on an already opened file. kwargs takes keyword + arguments used to customize the copying process. See the + documentation of :meth:`File.copy_file` for a description of those + arguments. + + """ + + # Open the source file. + srcfileh = open_file(srcfilename, mode="r") + + try: + # Copy it to the destination file. + srcfileh.copy_file(dstfilename, overwrite=overwrite, **kwargs) + finally: + # Close the source file. + srcfileh.close() + + +hdf5_version_str = utilsextension.get_hdf5_version() +hdf5_version_tup = tuple(map(int, hdf5_version_str.split('-')[0].split('.'))) +_FILE_OPEN_POLICY = 'strict' if hdf5_version_tup < (1, 8, 7) else 'default' + + +def open_file(filename, mode="r", title="", root_uep="/", filters=None, + **kwargs): + """Open a PyTables (or generic HDF5) file and return a File object. + + Parameters + ---------- + filename : str + The name of the file (supports environment variable expansion). + It is suggested that file names have any of the .h5, .hdf or + .hdf5 extensions, although this is not mandatory. + mode : str + The mode to open the file. It can be one of the + following: + + * *'r'*: Read-only; no data can be modified. + * *'w'*: Write; a new file is created (an existing file + with the same name would be deleted). + * *'a'*: Append; an existing file is opened for reading and + writing, and if the file does not exist it is created. + * *'r+'*: It is similar to 'a', but the file must already + exist. + + title : str + If the file is to be created, a TITLE string attribute will be + set on the root group with the given value. Otherwise, the + title will be read from disk, and this will not have any effect. + root_uep : str + The root User Entry Point. This is a group in the HDF5 hierarchy + which will be taken as the starting point to create the object + tree. It can be whatever existing group in the file, named by + its HDF5 path. If it does not exist, an HDF5ExtError is issued. + Use this if you do not want to build the *entire* object tree, + but rather only a *subtree* of it. + + .. versionchanged:: 3.0 + The *rootUEP* parameter has been renamed into *root_uep*. 
+ + filters : Filters + An instance of the Filters (see :ref:`FiltersClassDescr`) class + that provides information about the desired I/O filters + applicable to the leaves that hang directly from the *root group*, + unless other filter properties are specified for these leaves. + Besides, if you do not specify filter properties for child groups, + they will inherit these ones, which will in turn propagate to + child nodes. + + Notes + ----- + In addition, it recognizes the (lowercase) names of parameters + present in :file:`tables/parameters.py` as additional keyword + arguments. + See :ref:`parameter_files` for a detailed info on the supported + parameters. + + .. note:: + + If you need to deal with a large number of nodes in an + efficient way, please see :ref:`LRUOptim` for more info and + advices about the integrated node cache engine. + + """ + filename = os.fspath(filename) + # XXX filename normalization ?? + + # Check already opened files + if _FILE_OPEN_POLICY == 'strict': + # This policy do not allows to open the same file multiple times + # even in read-only mode + if filename in _open_files: + raise ValueError( + "The file '%s' is already opened. " + "Please close it before reopening. " + "HDF5 v.%s, FILE_OPEN_POLICY = '%s'" % ( + filename, utilsextension.get_hdf5_version(), + _FILE_OPEN_POLICY)) + else: + for filehandle in _open_files.get_handlers_by_name(filename): + omode = filehandle.mode + # 'r' is incompatible with everything except 'r' itself + if mode == 'r' and omode != 'r': + raise ValueError( + "The file '%s' is already opened, but " + "not in read-only mode (as requested)." % filename) + # 'a' and 'r+' are compatible with everything except 'r' + elif mode in ('a', 'r+') and omode == 'r': + raise ValueError( + "The file '%s' is already opened, but " + "in read-only mode. Please close it before " + "reopening in append mode." % filename) + # 'w' means that we want to destroy existing contents + elif mode == 'w': + raise ValueError( + "The file '%s' is already opened. Please " + "close it before reopening in write mode." % filename) + + # Finally, create the File instance, and return it + return File(filename, mode, title, root_uep, filters, **kwargs) + + +# A dumb class that doesn't keep nothing at all +class _NoCache: + def __len__(self): + return 0 + + def __contains__(self, key): + return False + + def __iter__(self): + return iter([]) + + def __setitem__(self, key, value): + pass + + __marker = object() + + def pop(self, key, d=__marker): + if d is not self.__marker: + return d + raise KeyError(key) + + +class _DictCache(dict): + def __init__(self, nslots): + if nslots < 1: + raise ValueError("Invalid number of slots: %d" % nslots) + self.nslots = nslots + super().__init__() + + def __setitem__(self, key, value): + # Check if we are running out of space + if len(self) > self.nslots: + warnings.warn( + "the dictionary of node cache is exceeding the recommended " + "maximum number (%d); be ready to see PyTables asking for " + "*lots* of memory and possibly slow I/O." 
% ( + self.nslots), PerformanceWarning) + super().__setitem__(key, value) + + +class NodeManager: + def __init__(self, nslots=64, node_factory=None): + super().__init__() + + self.registry = weakref.WeakValueDictionary() + + if nslots > 0: + cache = lrucacheextension.NodeCache(nslots) + elif nslots == 0: + cache = _NoCache() + else: + # nslots < 0 + cache = _DictCache(-nslots) + + self.cache = cache + + # node_factory(node_path) + self.node_factory = node_factory + + def register_node(self, node, key): + if key is None: + key = node._v_pathname + + if key in self.registry: + if not self.registry[key]._v_isopen: + del self.registry[key] + self.registry[key] = node + elif self.registry[key] is not node: + raise RuntimeError('trying to register a node with an ' + 'existing key: ``%s``' % key) + else: + self.registry[key] = node + + def cache_node(self, node, key=None): + if key is None: + key = node._v_pathname + + self.register_node(node, key) + if key in self.cache: + oldnode = self.cache.pop(key) + if oldnode is not node and oldnode._v_isopen: + raise RuntimeError('trying to cache a node with an ' + 'existing key: ``%s``' % key) + + self.cache[key] = node + + def get_node(self, key): + node = self.cache.pop(key, None) + if node is not None: + if node._v_isopen: + self.cache_node(node, key) + return node + else: + # this should not happen + warnings.warn("a closed node found in the cache: ``%s``" % key) + + if key in self.registry: + node = self.registry[key] + if node is None: + # this should not happen since WeakValueDictionary drops all + # dead weakrefs + warnings.warn("None is stored in the registry for key: " + "``%s``" % key) + elif node._v_isopen: + self.cache_node(node, key) + return node + else: + # this should not happen + warnings.warn("a closed node found in the registry: " + "``%s``" % key) + del self.registry[key] + node = None + + if self.node_factory: + node = self.node_factory(key) + self.cache_node(node, key) + + return node + + def rename_node(self, oldkey, newkey): + for cache in (self.cache, self.registry): + if oldkey in cache: + node = cache.pop(oldkey) + cache[newkey] = node + + def drop_from_cache(self, nodepath): + """Remove the node from cache""" + + # Remove the node from the cache. + self.cache.pop(nodepath, None) + + def drop_node(self, node, check_unregistered=True): + """Drop the `node`. + + Remove the node from the cache and, if it has no more references, + close it. + + """ + + # Remove all references to the node. 
+ nodepath = node._v_pathname + + self.drop_from_cache(nodepath) + + if nodepath in self.registry: + if not node._v_isopen: + del self.registry[nodepath] + elif check_unregistered: + # If the node is not in the registry (this should never happen) + # we close it forcibly since it is not ensured that the __del__ + # method is called for object that are still alive when the + # interpreter is shut down + if node._v_isopen: + warnings.warn("dropping a node that is not in the registry: " + "``%s``" % nodepath) + + node._g_pre_kill_hook() + node._f_close() + + def flush_nodes(self): + # Only iter on the nodes in the registry since nodes in the cahce + # should always have an entry in the registry + closed_keys = [] + for path, node in list(self.registry.items()): + if not node._v_isopen: + closed_keys.append(path) + elif '/_i_' not in path: # Indexes are not necessary to be flushed + if isinstance(node, Leaf): + node.flush() + + for path in closed_keys: + # self.cache.pop(path, None) + if path in self.cache: + warnings.warn("closed node the cache: ``%s``" % path) + self.cache.pop(path, None) + self.registry.pop(path) + + @staticmethod + def _close_nodes(nodepaths, get_node): + for nodepath in nodepaths: + try: + node = get_node(nodepath) + except KeyError: + pass + else: + if not node._v_isopen or node._v__deleting: + continue + + try: + # Avoid descendent nodes to also iterate over + # their descendents, which are already to be + # closed by this loop. + if hasattr(node, '_f_get_child'): + node._g_close() + else: + node._f_close() + del node + except ClosedNodeError: + # import traceback + # type_, value, tb = sys.exc_info() + # exception_dump = ''.join( + # traceback.format_exception(type_, value, tb)) + # warnings.warn( + # "A '%s' exception occurred trying to close a node " + # "that was supposed to be open.\n" + # "%s" % (type_.__name__, exception_dump)) + pass + + def close_subtree(self, prefix='/'): + if not prefix.endswith('/'): + prefix = prefix + '/' + + cache = self.cache + registry = self.registry + + # Ensure tables are closed before their indices + paths = [ + path for path in cache + if path.startswith(prefix) and '/_i_' not in path + ] + self._close_nodes(paths, cache.pop) + + # Close everything else (i.e. indices) + paths = [path for path in cache if path.startswith(prefix)] + self._close_nodes(paths, cache.pop) + + # Ensure tables are closed before their indices + paths = [ + path for path in registry + if path.startswith(prefix) and '/_i_' not in path + ] + self._close_nodes(paths, registry.pop) + + # Close everything else (i.e. indices) + paths = [path for path in registry if path.startswith(prefix)] + self._close_nodes(paths, registry.pop) + + def shutdown(self): + registry = self.registry + cache = self.cache + + # self.close_subtree('/') + + keys = list(cache) # copy + for key in keys: + node = cache.pop(key) + if node._v_isopen: + registry.pop(node._v_pathname, None) + node._f_close() + + while registry: + key, node = registry.popitem() + if node._v_isopen: + node._f_close() + + +class File(hdf5extension.File): + """The in-memory representation of a PyTables file. + + An instance of this class is returned when a PyTables file is + opened with the :func:`tables.open_file` function. It offers methods + to manipulate (create, rename, delete...) nodes and handle their + attributes, as well as methods to traverse the object tree. + The *user entry point* to the object tree attached to the HDF5 file + is represented in the root_uep attribute. + Other attributes are available. 
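+
+    A minimal usage sketch (the file name and node names below are only
+    illustrative, not part of any shipped example)::
+
+        import tables
+
+        with tables.open_file('demo.h5', mode='w') as h5file:
+            group = h5file.create_group('/', 'detector', 'Detector data')
+            h5file.create_array(group, 'readings', [1, 2, 3], 'Sample data')
+        # The file is closed automatically when the with block ends.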
+ + File objects support an *Undo/Redo mechanism* which can be enabled + with the :meth:`File.enable_undo` method. Once the Undo/Redo + mechanism is enabled, explicit *marks* (with an optional unique + name) can be set on the state of the database using the + :meth:`File.mark` + method. There are two implicit marks which are always available: + the initial mark (0) and the final mark (-1). Both the identifier + of a mark and its name can be used in *undo* and *redo* operations. + + Hierarchy manipulation operations (node creation, movement and + removal) and attribute handling operations (setting and deleting) + made after a mark can be undone by using the :meth:`File.undo` + method, which returns the database to the state of a past mark. + If undo() is not followed by operations that modify the hierarchy + or attributes, the :meth:`File.redo` method can be used to return + the database to the state of a future mark. Else, future states of + the database are forgotten. + + Note that data handling operations can not be undone nor redone by + now. Also, hierarchy manipulation operations on nodes that do not + support the Undo/Redo mechanism issue an UndoRedoWarning *before* + changing the database. + + The Undo/Redo mechanism is persistent between sessions and can + only be disabled by calling the :meth:`File.disable_undo` method. + + File objects can also act as context managers when using the with + statement introduced in Python 2.5. When exiting a context, the + file is automatically closed. + + Parameters + ---------- + filename : str + The name of the file (supports environment variable expansion). + It is suggested that file names have any of the .h5, .hdf or + .hdf5 extensions, although this is not mandatory. + + mode : str + The mode to open the file. It can be one of the + following: + + * *'r'*: Read-only; no data can be modified. + * *'w'*: Write; a new file is created (an existing file + with the same name would be deleted). + * *'a'*: Append; an existing file is opened for reading + and writing, and if the file does not exist it is created. + * *'r+'*: It is similar to 'a', but the file must already + exist. + + title : str + If the file is to be created, a TITLE string attribute will be + set on the root group with the given value. Otherwise, the + title will be read from disk, and this will not have any effect. + + root_uep : str + The root User Entry Point. This is a group in the HDF5 hierarchy + which will be taken as the starting point to create the object + tree. It can be whatever existing group in the file, named by + its HDF5 path. If it does not exist, an HDF5ExtError is issued. + Use this if you do not want to build the *entire* object tree, + but rather only a *subtree* of it. + + .. versionchanged:: 3.0 + The *rootUEP* parameter has been renamed into *root_uep*. + + filters : Filters + An instance of the Filters (see :ref:`FiltersClassDescr`) class that + provides information about the desired I/O filters applicable to the + leaves that hang directly from the *root group*, unless other filter + properties are specified for these leaves. Besides, if you do not + specify filter properties for child groups, they will inherit these + ones, which will in turn propagate to child nodes. + + Notes + ----- + In addition, it recognizes the (lowercase) names of parameters + present in :file:`tables/parameters.py` as additional keyword + arguments. + See :ref:`parameter_files` for a detailed info on the supported + parameters. + + + .. rubric:: File attributes + + .. 
attribute:: filename
+
+        The name of the opened file.
+
+    .. attribute:: format_version
+
+        The PyTables version number of this file.
+
+    .. attribute:: isopen
+
+        True if the underlying file is open, false otherwise.
+
+    .. attribute:: mode
+
+        The mode in which the file was opened.
+
+    .. attribute:: root
+
+        The *root* of the object tree hierarchy (a Group instance).
+
+    .. attribute:: root_uep
+
+        The UEP (user entry point) group name in the file (see
+        the :func:`open_file` function).
+
+        .. versionchanged:: 3.0
+            The *rootUEP* attribute has been renamed into *root_uep*.
+
+    """
+
+    # The top level kinds. Group must go first!
+    _node_kinds = ('Group', 'Leaf', 'Link', 'Unknown')
+
+    @property
+    def title(self):
+        """The title of the root group in the file."""
+        return self.root._v_title
+
+    @title.setter
+    def title(self, title):
+        self.root._v_title = title
+
+    @title.deleter
+    def title(self):
+        del self.root._v_title
+
+    @property
+    def filters(self):
+        """Default filter properties for the root group
+        (see :ref:`FiltersClassDescr`)."""
+        return self.root._v_filters
+
+    @filters.setter
+    def filters(self, filters):
+        self.root._v_filters = filters
+
+    @filters.deleter
+    def filters(self):
+        del self.root._v_filters
+
+    def __init__(self, filename, mode="r", title="",
+                 root_uep="/", filters=None, **kwargs):
+
+        self.filename = os.fspath(filename)
+        """The name of the opened file."""
+
+        self.mode = mode
+        """The mode in which the file was opened."""
+
+        if mode not in ('r', 'r+', 'a', 'w'):
+            raise ValueError("invalid mode string ``%s``. Allowed modes are: "
+                             "'r', 'r+', 'a' and 'w'" % mode)
+
+        # Get all the parameters in parameter file(s)
+        params = {k: v for k, v in parameters.__dict__.items()
+                  if k.isupper() and not k.startswith('_')}
+        # Update them with possible keyword arguments
+        if [k for k in kwargs if k.isupper()]:
+            warnings.warn("The use of uppercase keyword parameters is "
+                          "deprecated", DeprecationWarning)
+
+        kwargs = {k.upper(): v for k, v in kwargs.items()}
+        params.update(kwargs)
+
+        # If MAX_*_THREADS is not set yet, set it to the number of cores
+        # on this machine.
+
+        if params['MAX_NUMEXPR_THREADS'] is None:
+            params['MAX_NUMEXPR_THREADS'] = detect_number_of_cores()
+
+        if params['MAX_BLOSC_THREADS'] is None:
+            params['MAX_BLOSC_THREADS'] = detect_number_of_cores()
+
+        self.params = params
+
+        # Now, it is time to initialize the File extension
+        self._g_new(filename, mode, **params)
+
+        # Check filters and set PyTables format version for new files.
+        new = self._v_new
+        if new:
+            _checkfilters(filters)
+            self.format_version = format_version
+            """The PyTables version number of this file."""
+
+        # The node manager must be initialized before the root group
+        # initialization, but the node_factory attribute is set only later
+        # because it is a bound method of the root group itself.
+        node_cache_slots = params['NODE_CACHE_SLOTS']
+        self._node_manager = NodeManager(nslots=node_cache_slots)
+
+        # For the moment Undo/Redo is not enabled.
+        self._undoEnabled = False
+
+        # Set the flag to indicate that the file has been opened.
+        # It must be set before opening the root group
+        # to allow some basic access to its attributes.
+        self.isopen = 1
+        """True if the underlying file is open, False otherwise."""
+
+        # Append the name of the file to the global dict of files opened.
+ _open_files.add(self) + + # Set the number of times this file has been opened to 1 + self._open_count = 1 + + # Get the root group from this file + self.root = root = self.__get_root_group(root_uep, title, filters) + """The *root* of the object tree hierarchy (a Group instance).""" + # Complete the creation of the root node + # (see the explanation in ``RootGroup.__init__()``. + root._g_post_init_hook() + self._node_manager.node_factory = self.root._g_load_child + + # Save the PyTables format version for this file. + if new: + if params['PYTABLES_SYS_ATTRS']: + root._v_attrs._g__setattr( + 'PYTABLES_FORMAT_VERSION', format_version) + + # If the file is old, and not opened in "read-only" mode, + # check if it has a transaction log + if not new and self.mode != "r" and _trans_group_path in self: + # It does. Enable the undo. + self.enable_undo() + + # Set the maximum number of threads for Numexpr + ne.set_vml_num_threads(params['MAX_NUMEXPR_THREADS']) + + def __get_root_group(self, root_uep, title, filters): + """Returns a Group instance which will act as the root group in the + hierarchical tree. + + If file is opened in "r", "r+" or "a" mode, and the file already + exists, this method dynamically builds a python object tree + emulating the structure present on file. + + """ + + self._v_objectid = self._get_file_id() + + if root_uep in [None, ""]: + root_uep = "/" + # Save the User Entry Point in a variable class + self.root_uep = root_uep + + new = self._v_new + + # Get format version *before* getting the object tree + if not new: + # Firstly, get the PyTables format version for this file + self.format_version = utilsextension.read_f_attr( + self._v_objectid, 'PYTABLES_FORMAT_VERSION') + if not self.format_version: + # PYTABLES_FORMAT_VERSION attribute is not present + self.format_version = "unknown" + self._isPTFile = False + elif not isinstance(self.format_version, str): + # system attributes should always be str + self.format_version = self.format_version.decode('utf-8') + + # Create new attributes for the root Group instance and + # create the object tree + return RootGroup(self, root_uep, title=title, new=new, filters=filters) + + def _get_or_create_path(self, path, create): + """Get the given `path` or create it if `create` is true. + + If `create` is true, `path` *must* be a string path and not a + node, otherwise a `TypeError`will be raised. + + """ + + if create: + return self._create_path(path) + else: + return self.get_node(path) + + def _create_path(self, path): + """Create the groups needed for the `path` to exist. + + The group associated with the given `path` is returned. + + """ + + if not hasattr(path, 'split'): + raise TypeError("when creating parents, parent must be a path") + + if path == '/': + return self.root + + parent, create_group = self.root, self.create_group + for pcomp in path.split('/')[1:]: + try: + child = parent._f_get_child(pcomp) + except NoSuchNodeError: + child = create_group(parent, pcomp) + parent = child + return parent + + def create_group(self, where, name, title="", filters=None, + createparents=False): + """Create a new group. + + Parameters + ---------- + where : str or Group + The parent group from which the new group will hang. It can be a + path string (for example '/level1/leaf5'), or a Group instance + (see :ref:`GroupClassDescr`). + name : str + The name of the new group. + title : str, optional + A description for this node (it sets the TITLE HDF5 attribute on + disk). 
+ filters : Filters + An instance of the Filters class (see :ref:`FiltersClassDescr`) + that provides information about the desired I/O filters applicable + to the leaves that hang directly from this new group (unless other + filter properties are specified for these leaves). Besides, if you + do not specify filter properties for its child groups, they will + inherit these ones. + createparents : bool + Whether to create the needed groups for the parent + path to exist (not done by default). + + See Also + -------- + Group : for more information on groups + + """ + + parentnode = self._get_or_create_path(where, createparents) + _checkfilters(filters) + return Group(parentnode, name, + title=title, new=True, filters=filters) + + def create_table(self, where, name, description=None, title="", + filters=None, expectedrows=10_000, + chunkshape=None, byteorder=None, + createparents=False, obj=None, track_times=True): + """Create a new table with the given name in where location. + + Parameters + ---------- + where : str or Group + The parent group from which the new table will hang. It can be a + path string (for example '/level1/leaf5'), or a Group instance + (see :ref:`GroupClassDescr`). + name : str + The name of the new table. + description : Description + This is an object that describes the table, i.e. how + many columns it has, their names, types, shapes, etc. It + can be any of the following: + + * *A user-defined class*: This should inherit from the + IsDescription class (see :ref:`IsDescriptionClassDescr`) + where table fields are specified. + * *A dictionary*: For example, when you do not know + beforehand which structure your table will have). + * *A Description instance*: You can use the description + attribute of another table to create a new one with the + same structure. + * *A NumPy dtype*: A completely general structured NumPy + dtype. + * *A NumPy (structured) array instance*: The dtype of + this structured array will be used as the description. + Also, in case the array has actual data, it will be + injected into the newly created table. + + .. versionchanged:: 3.0 + The *description* parameter can be None (default) if *obj* is + provided. In that case the structure of the table is deduced + by *obj*. + + title : str + A description for this node (it sets the TITLE HDF5 attribute + on disk). + filters : Filters + An instance of the Filters class (see :ref:`FiltersClassDescr`) + that provides information about the desired I/O filters to be + applied during the life of this object. + expectedrows : int + A user estimate of the number of records that will be in the table. + If not provided, the default value is EXPECTED_ROWS_TABLE (see + :file:`tables/parameters.py`). If you plan to create a bigger + table try providing a guess; this will optimize the HDF5 B-Tree + creation and management process time and memory used. + chunkshape + The shape of the data chunk to be read or written in a + single HDF5 I/O operation. Filters are applied to those + chunks of data. The rank of the chunkshape for tables must + be 1. If None, a sensible value is calculated based on the + expectedrows parameter (which is recommended). + byteorder : str + The byteorder of data *on disk*, specified as 'little' or 'big'. + If this is not specified, the byteorder is that of the platform, + unless you passed an array as the description, in which case + its byteorder will be used. + createparents : bool + Whether to create the needed groups for the parent path to exist + (not done by default). 
+        obj : python object
+            The recarray to be saved. Accepted types are NumPy record
+            arrays.
+
+            The *obj* parameter is optional and it can be provided as an
+            alternative to the *description* parameter.
+            If both *obj* and *description* are provided they must
+            be consistent with each other.
+
+            .. versionadded:: 3.0
+
+        track_times
+            Whether time data associated with the leaf are recorded (object
+            access time, raw data modification time, metadata change time,
+            object birth time); default True. Semantics of these times
+            depend on their implementation in the HDF5 library: refer to
+            documentation of the H5O_info_t data structure. As of HDF5
+            1.8.15, only ctime (metadata change time) is implemented.
+
+            .. versionadded:: 3.4.3
+
+        See Also
+        --------
+        Table : for more information on tables
+
+        """
+
+        if obj is not None:
+            if not isinstance(obj, np.ndarray):
+                raise TypeError('invalid obj parameter %r' % obj)
+
+            descr, _ = descr_from_dtype(obj.dtype, ptparams=self.params)
+            if (description is not None and
+                    dtype_from_descr(description,
+                                     ptparams=self.params) != obj.dtype):
+                raise TypeError('the description parameter is not consistent '
+                                'with the data type of the obj parameter')
+            elif description is None:
+                description = descr
+
+        parentnode = self._get_or_create_path(where, createparents)
+        if description is None:
+            raise ValueError("invalid table description: None")
+        _checkfilters(filters)
+
+        ptobj = Table(parentnode, name,
+                      description=description, title=title,
+                      filters=filters, expectedrows=expectedrows,
+                      chunkshape=chunkshape, byteorder=byteorder,
+                      track_times=track_times)
+
+        if obj is not None:
+            ptobj.append(obj)
+
+        return ptobj
+
+    def create_array(self, where, name, obj=None, title="",
+                     byteorder=None, createparents=False,
+                     atom=None, shape=None, track_times=True):
+        """Create a new array.
+
+        Parameters
+        ----------
+        where : str or Group
+            The parent group from which the new array will hang. It can be a
+            path string (for example '/level1/leaf5'), or a Group instance
+            (see :ref:`GroupClassDescr`).
+        name : str
+            The name of the new array
+        obj : python object
+            The array or scalar to be saved. Accepted types are NumPy
+            arrays and scalars, as well as native Python sequences and
+            scalars, provided that values are regular (i.e. they are
+            not like ``[[1,2],2]``) and homogeneous (i.e. all the
+            elements are of the same type).
+
+            Also, objects that have some of their dimensions equal to 0
+            are not supported (use an EArray node (see
+            :ref:`EArrayClassDescr`) if you want to store an array with
+            one of its dimensions equal to 0).
+
+            .. versionchanged:: 3.0
+                The *object* parameter has been renamed into *obj*.
+
+        title : str
+            A description for this node (it sets the TITLE HDF5 attribute on
+            disk).
+        byteorder : str
+            The byteorder of the data *on disk*, specified as 'little' or
+            'big'. If this is not specified, the byteorder is that of the
+            given object.
+        createparents : bool, optional
+            Whether to create the needed groups for the parent path to exist
+            (not done by default).
+        atom : Atom
+            An Atom (see :ref:`AtomClassDescr`) instance representing
+            the *type* and *shape* of the atomic objects to be saved.
+
+            .. versionadded:: 3.0
+
+        shape : tuple of ints
+            The shape of the stored array.
+
+            .. versionadded:: 3.0
+
+        track_times
+            Whether time data associated with the leaf are recorded (object
+            access time, raw data modification time, metadata change time,
+            object birth time); default True.
+            Semantics of these times
+            depend on their implementation in the HDF5 library: refer to
+            documentation of the H5O_info_t data structure. As of HDF5
+            1.8.15, only ctime (metadata change time) is implemented.
+
+            .. versionadded:: 3.4.3
+
+        See Also
+        --------
+        Array : for more information on arrays
+        create_table : for more information on the rest of parameters
+
+        """
+
+        if obj is None:
+            if atom is None or shape is None:
+                raise TypeError('if the obj parameter is not specified '
+                                '(or None) then both the atom and shape '
+                                'parameters should be provided.')
+            else:
+                # Making strides=(0,...) below is a trick to create the
+                # array fast and without memory consumption
+                dflt = np.zeros((), dtype=atom.dtype)
+                obj = np.ndarray(shape, dtype=atom.dtype, buffer=dflt,
+                                 strides=(0,)*len(shape))
+        else:
+            flavor = flavor_of(obj)
+            # Use a temporary object because converting obj at this stage
+            # breaks some tests. This solution performs a double,
+            # potentially expensive, conversion of the obj parameter.
+            _obj = array_as_internal(obj, flavor)
+
+            if shape is not None and shape != _obj.shape:
+                raise TypeError(
+                    'the shape parameter does not match obj.shape')
+
+            if atom is not None and atom.dtype != _obj.dtype:
+                raise TypeError('the atom parameter is not consistent with '
+                                'the data type of the obj parameter')
+
+        parentnode = self._get_or_create_path(where, createparents)
+        return Array(parentnode, name,
+                     obj=obj, title=title, byteorder=byteorder,
+                     track_times=track_times)
+
+    def create_carray(self, where, name, atom=None, shape=None, title="",
+                      filters=None, chunkshape=None,
+                      byteorder=None, createparents=False, obj=None,
+                      track_times=True):
+        """Create a new chunked array.
+
+        Parameters
+        ----------
+        where : str or Group
+            The parent group from which the new array will hang. It can
+            be a path string (for example '/level1/leaf5'), or a Group
+            instance (see :ref:`GroupClassDescr`).
+        name : str
+            The name of the new array
+        atom : Atom
+            An Atom (see :ref:`AtomClassDescr`) instance representing
+            the *type* and *shape* of the atomic objects to be saved.
+
+            .. versionchanged:: 3.0
+                The *atom* parameter can be None (default) if *obj* is
+                provided.
+
+        shape : tuple
+            The shape of the new array.
+
+            .. versionchanged:: 3.0
+                The *shape* parameter can be None (default) if *obj* is
+                provided.
+
+        title : str, optional
+            A description for this node (it sets the TITLE HDF5 attribute
+            on disk).
+        filters : Filters, optional
+            An instance of the Filters class (see :ref:`FiltersClassDescr`)
+            that provides information about the desired I/O filters to
+            be applied during the life of this object.
+        chunkshape : tuple or number or None, optional
+            The shape of the data chunk to be read or written in a
+            single HDF5 I/O operation. Filters are applied to those
+            chunks of data. The dimensionality of chunkshape must be
+            the same as that of shape. If None, a sensible value is
+            calculated (which is recommended).
+        byteorder : str, optional
+            The byteorder of the data *on disk*, specified as 'little'
+            or 'big'. If this is not specified, the byteorder is that
+            of the given object.
+        createparents : bool, optional
+            Whether to create the needed groups for the parent path to
+            exist (not done by default).
+        obj : python object
+            The array or scalar to be saved. Accepted types are NumPy
+            arrays and scalars, as well as native Python sequences and
+            scalars, provided that values are regular (i.e. they are
+            not like ``[[1,2],2]``) and homogeneous (i.e. all the
+            elements are of the same type).
+ + Also, objects that have some of their dimensions equal to 0 + are not supported. Please use an EArray node (see + :ref:`EArrayClassDescr`) if you want to store an array with + one of its dimensions equal to 0. + + The *obj* parameter is optional and it can be provided in + alternative to the *atom* and *shape* parameters. + If both *obj* and *atom* and/or *shape* are provided they must + be consistent with each other. + + .. versionadded:: 3.0 + + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, + object birth time); default True. Semantics of these times + depend on their implementation in the HDF5 library: refer to + documentation of the H5O_info_t data structure. As of HDF5 + 1.8.15, only ctime (metadata change time) is implemented. + + .. versionadded:: 3.4.3 + + See Also + -------- + CArray : for more information on chunked arrays + + """ + + if obj is not None: + flavor = flavor_of(obj) + obj = array_as_internal(obj, flavor) + + if shape is not None and shape != obj.shape: + raise TypeError('the shape parameter do not match obj.shape') + else: + shape = obj.shape + + if atom is not None and atom.dtype != obj.dtype: + raise TypeError("the 'atom' parameter is not consistent with " + "the data type of the 'obj' parameter") + elif atom is None: + atom = Atom.from_dtype(obj.dtype) + else: + if atom is None and shape is None: + raise TypeError( + "the 'atom' and 'shape' parameters or the 'obj' parameter " + "must be provided") + + parentnode = self._get_or_create_path(where, createparents) + _checkfilters(filters) + ptobj = CArray(parentnode, name, + atom=atom, shape=shape, title=title, filters=filters, + chunkshape=chunkshape, byteorder=byteorder, + track_times=track_times) + + if obj is not None: + ptobj[...] = obj + + return ptobj + + def create_earray(self, where, name, atom=None, shape=None, title="", + filters=None, expectedrows=1000, + chunkshape=None, byteorder=None, + createparents=False, obj=None, track_times=True): + """Create a new enlargeable array. + + Parameters + ---------- + where : str or Group + The parent group from which the new array will hang. It can be a + path string (for example '/level1/leaf5'), or a Group instance + (see :ref:`GroupClassDescr`). + name : str + The name of the new array + atom : Atom + An Atom (see :ref:`AtomClassDescr`) instance representing the + *type* and *shape* of the atomic objects to be saved. + + .. versionchanged:: 3.0 + The *atom* parameter can be None (default) if *obj* is + provided. + + shape : tuple + The shape of the new array. One (and only one) of the shape + dimensions *must* be 0. The dimension being 0 means that the + resulting EArray object can be extended along it. Multiple + enlargeable dimensions are not supported right now. + + .. versionchanged:: 3.0 + The *shape* parameter can be None (default) if *obj* is + provided. + + title : str, optional + A description for this node (it sets the TITLE HDF5 attribute on + disk). + expectedrows : int, optional + A user estimate about the number of row elements that will be added + to the growable dimension in the EArray node. If not provided, the + default value is EXPECTED_ROWS_EARRAY (see tables/parameters.py). + If you plan to create either a much smaller or a much bigger array + try providing a guess; this will optimize the HDF5 B-Tree creation + and management process time and the amount of memory used. 
+ chunkshape : tuple, numeric, or None, optional + The shape of the data chunk to be read or written in a single HDF5 + I/O operation. Filters are applied to those chunks of data. The + dimensionality of chunkshape must be the same as that of shape + (beware: no dimension should be 0 this time!). If None, a sensible + value is calculated based on the expectedrows parameter (which is + recommended). + byteorder : str, optional + The byteorder of the data *on disk*, specified as 'little' or + 'big'. If this is not specified, the byteorder is that of the + platform. + createparents : bool, optional + Whether to create the needed groups for the parent path to exist + (not done by default). + obj : python object + The array or scalar to be saved. Accepted types are NumPy + arrays and scalars, as well as native Python sequences and + scalars, provided that values are regular (i.e. they are + not like ``[[1,2],2]``) and homogeneous (i.e. all the + elements are of the same type). + + The *obj* parameter is optional and it can be provided in + alternative to the *atom* and *shape* parameters. + If both *obj* and *atom* and/or *shape* are provided they must + be consistent with each other. + + .. versionadded:: 3.0 + + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, + object birth time); default True. Semantics of these times + depend on their implementation in the HDF5 library: refer to + documentation of the H5O_info_t data structure. As of HDF5 + 1.8.15, only ctime (metadata change time) is implemented. + + .. versionadded:: 3.4.3 + + See Also + -------- + EArray : for more information on enlargeable arrays + + """ + + if obj is not None: + flavor = flavor_of(obj) + obj = array_as_internal(obj, flavor) + + earray_shape = (0,) + obj.shape[1:] + + if shape is not None and shape != earray_shape: + raise TypeError('the shape parameter is not compatible ' + 'with obj.shape.') + else: + shape = earray_shape + + if atom is not None and atom.dtype != obj.dtype: + raise TypeError('the atom parameter is not consistent with ' + 'the data type of the obj parameter') + elif atom is None: + atom = Atom.from_dtype(obj.dtype) + + parentnode = self._get_or_create_path(where, createparents) + _checkfilters(filters) + ptobj = EArray(parentnode, name, + atom=atom, shape=shape, title=title, + filters=filters, expectedrows=expectedrows, + chunkshape=chunkshape, byteorder=byteorder, + track_times=track_times) + + if obj is not None: + ptobj.append(obj) + + return ptobj + + def create_vlarray(self, where, name, atom=None, title="", + filters=None, expectedrows=None, + chunkshape=None, byteorder=None, + createparents=False, obj=None, + track_times=True): + """Create a new variable-length array. + + Parameters + ---------- + where : str or Group + The parent group from which the new array will hang. It can + be a path string (for example '/level1/leaf5'), or a Group + instance (see :ref:`GroupClassDescr`). + name : str + The name of the new array + atom : Atom + An Atom (see :ref:`AtomClassDescr`) instance representing + the *type* and *shape* of the atomic objects to be saved. + + .. versionchanged:: 3.0 + The *atom* parameter can be None (default) if *obj* is + provided. + + title : str, optional + A description for this node (it sets the TITLE HDF5 attribute + on disk). 
+ filters : Filters + An instance of the Filters class (see :ref:`FiltersClassDescr`) + that provides information about the desired I/O filters to + be applied during the life of this object. + expectedrows : int, optional + A user estimate about the number of row elements that will + be added to the growable dimension in the `VLArray` node. + If not provided, the default value is ``EXPECTED_ROWS_VLARRAY`` + (see ``tables/parameters.py``). If you plan to create either + a much smaller or a much bigger `VLArray` try providing a guess; + this will optimize the HDF5 B-Tree creation and management + process time and the amount of memory used. + + .. versionadded:: 3.0 + + chunkshape : int or tuple of int, optional + The shape of the data chunk to be read or written in a + single HDF5 I/O operation. Filters are applied to those + chunks of data. The dimensionality of chunkshape must be 1. + If None, a sensible value is calculated (which is recommended). + byteorder : str, optional + The byteorder of the data *on disk*, specified as 'little' or + 'big'. If this is not specified, the byteorder is that of the + platform. + createparents : bool, optional + Whether to create the needed groups for the parent path to + exist (not done by default). + obj : python object + The array or scalar to be saved. Accepted types are NumPy + arrays and scalars, as well as native Python sequences and + scalars, provided that values are regular (i.e. they are + not like ``[[1,2],2]``) and homogeneous (i.e. all the + elements are of the same type). + + The *obj* parameter is optional and it can be provided in + alternative to the *atom* parameter. + If both *obj* and *atom* and are provided they must + be consistent with each other. + + .. versionadded:: 3.0 + + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, + object birth time); default True. Semantics of these times + depend on their implementation in the HDF5 library: refer to + documentation of the H5O_info_t data structure. As of HDF5 + 1.8.15, only ctime (metadata change time) is implemented. + + .. versionadded:: 3.4.3 + + See Also + -------- + VLArray : for more informationon variable-length arrays + + .. versionchanged:: 3.0 + The *expectedsizeinMB* parameter has been replaced by + *expectedrows*. + + """ + + if obj is not None: + flavor = flavor_of(obj) + obj = array_as_internal(obj, flavor) + + if atom is not None and atom.dtype != obj.dtype: + raise TypeError('the atom parameter is not consistent with ' + 'the data type of the obj parameter') + if atom is None: + atom = Atom.from_dtype(obj.dtype) + elif atom is None: + raise ValueError('atom parameter cannot be None') + + parentnode = self._get_or_create_path(where, createparents) + _checkfilters(filters) + ptobj = VLArray(parentnode, name, + atom=atom, title=title, filters=filters, + expectedrows=expectedrows, + chunkshape=chunkshape, byteorder=byteorder, + track_times=track_times) + + if obj is not None: + ptobj.append(obj) + + return ptobj + + def create_hard_link(self, where, name, target, createparents=False): + """Create a hard link. + + Create a hard link to a `target` node with the given `name` in + `where` location. `target` can be a node object or a path + string. If `createparents` is true, the intermediate groups + required for reaching `where` are created (the default is not + doing so). + + The returned node is a regular `Group` or `Leaf` instance. 
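+
+        A minimal sketch (``f`` is assumed to be an open, writable file
+        handle that already contains a leaf at ``/group1/array1``; the
+        paths are only illustrative)::
+
+            # '/links/hard' ends up pointing at the very same data as
+            # '/group1/array1'.
+            link = f.create_hard_link('/links', 'hard', '/group1/array1',
+                                      createparents=True)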
+
+        """
+
+        targetnode = self.get_node(target)
+        parentnode = self._get_or_create_path(where, createparents)
+        linkextension._g_create_hard_link(parentnode, name, targetnode)
+        # Refresh children names in link's parent node
+        parentnode._g_add_children_names()
+        # Return the target node
+        return self.get_node(parentnode, name)
+
+    def create_soft_link(self, where, name, target, createparents=False):
+        """Create a soft link (aka symbolic link) to a `target` node.
+
+        Create a soft link (aka symbolic link) to a `target` node with
+        the given `name` in `where` location. `target` can be a node
+        object or a path string. If `createparents` is true, the
+        intermediate groups required for reaching `where` are created
+        (the default is not doing so).
+
+        The returned node is a SoftLink instance. See the SoftLink
+        class (in :ref:`SoftLinkClassDescr`) for more information on
+        soft links.
+
+        """
+
+        if not isinstance(target, str):
+            if hasattr(target, '_v_pathname'):  # quacks like a Node
+                target = target._v_pathname
+            else:
+                raise ValueError(
+                    "`target` has to be a string or a node object")
+        parentnode = self._get_or_create_path(where, createparents)
+        slink = SoftLink(parentnode, name, target)
+        # Refresh children names in link's parent node
+        parentnode._g_add_children_names()
+        return slink
+
+    def create_external_link(self, where, name, target, createparents=False):
+        """Create an external link.
+
+        Create an external link to a *target* node with the given *name*
+        in *where* location. *target* can be a node object in another
+        file or a path string in the form 'file:/path/to/node'. If
+        *createparents* is true, the intermediate groups required for
+        reaching *where* are created (the default is not doing so).
+
+        The returned node is an :class:`ExternalLink` instance.
+
+        """
+
+        if not isinstance(target, str):
+            if hasattr(target, '_v_pathname'):  # quacks like a Node
+                target = target._v_file.filename + ':' + target._v_pathname
+            else:
+                raise ValueError(
+                    "`target` has to be a string or a node object")
+        elif target.find(':/') == -1:
+            raise ValueError(
+                "`target` must be expressed as 'file:/path/to/node'")
+        parentnode = self._get_or_create_path(where, createparents)
+        elink = ExternalLink(parentnode, name, target)
+        # Refresh children names in link's parent node
+        parentnode._g_add_children_names()
+        return elink
+
+    def _get_node(self, nodepath):
+        # The root node is always at hand.
+        if nodepath == '/':
+            return self.root
+
+        node = self._node_manager.get_node(nodepath)
+        assert node is not None, "unable to instantiate node ``%s``" % nodepath
+
+        return node
+
+    def get_node(self, where, name=None, classname=None):
+        """Get the node under where with the given name.
+
+        Parameters
+        ----------
+        where : str or Node
+            This can be a path string leading to a node or a Node instance (see
+            :ref:`NodeClassDescr`). If no name is specified, that node is
+            returned.
+
+            .. note::
+
+                If where is a Node instance from a different file than the one
+                on which this function is called, the returned node will also
+                be from that other file.
+
+        name : str, optional
+            If a name is specified, this must be a string with the name of
+            a node under where. In this case the where argument can only
+            lead to a Group (see :ref:`GroupClassDescr`) instance (else a
+            TypeError is raised). The node called name under the group
+            where is returned.
+        classname : str, optional
+            If the classname argument is specified, it must be the name of
+            a class derived from Node (e.g. Table).
If the node is found but it + is not an instance of that class, a NoSuchNodeError is also raised. + + If the node to be returned does not exist, a NoSuchNodeError is + raised. Please note that hidden nodes are also considered. + + """ + + self._check_open() + + if isinstance(where, Node): + where._g_check_open() + + basepath = where._v_pathname + nodepath = join_path(basepath, name or '') or '/' + node = where._v_file._get_node(nodepath) + elif isinstance(where, (str, np.str_)): + if not where.startswith('/'): + raise NameError("``where`` must start with a slash ('/')") + + basepath = where + nodepath = join_path(basepath, name or '') or '/' + node = self._get_node(nodepath) + else: + raise TypeError( + f"``where`` must be a string or a node: {where!r}") + + # Finally, check whether the desired node is an instance + # of the expected class. + if classname: + class_ = get_class_by_name(classname) + if not isinstance(node, class_): + npathname = node._v_pathname + nclassname = node.__class__.__name__ + # This error message is right since it can never be shown + # for ``classname in [None, 'Node']``. + raise NoSuchNodeError( + "could not find a ``%s`` node at ``%s``; " + "instead, a ``%s`` node has been found there" + % (classname, npathname, nclassname)) + + return node + + def is_visible_node(self, path): + """Is the node under `path` visible? + + If the node does not exist, a NoSuchNodeError is raised. + + """ + + # ``util.isvisiblepath()`` is still recommended for internal use. + return self.get_node(path)._f_isvisible() + + def rename_node(self, where, newname, name=None, overwrite=False): + """Change the name of the node specified by where and name to newname. + + Parameters + ---------- + where, name + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted upon. + newname : str + The new name to be assigned to the node (a string). + overwrite : bool + Whether to recursively remove a node with the same + newname if it already exists (not done by default). + + """ + + obj = self.get_node(where, name=name) + obj._f_rename(newname, overwrite) + + def move_node(self, where, newparent=None, newname=None, name=None, + overwrite=False, createparents=False): + """Move the node specified by where and name to newparent/newname. + + Parameters + ---------- + where, name : path + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted upon. + newparent + The destination group the node will be moved into (a + path name or a Group instance). If it is + not specified or None, the current parent + group is chosen as the new parent. + newname + The new name to be assigned to the node in its + destination (a string). If it is not specified or + None, the current name is chosen as the + new name. + + Notes + ----- + The other arguments work as in :meth:`Node._f_move`. + + """ + + obj = self.get_node(where, name=name) + obj._f_move(newparent, newname, overwrite, createparents) + + def copy_node(self, where, newparent=None, newname=None, name=None, + overwrite=False, recursive=False, createparents=False, + **kwargs): + """Copy the node specified by where and name to newparent/newname. + + Parameters + ---------- + where : str + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted + upon. + newparent : str or Group + The destination group that the node will be copied + into (a path name or a Group + instance). If not specified or None, the + current parent group is chosen as the new parent. 
+ newname : str + The name to be assigned to the new copy in its + destination (a string). If it is not specified or + None, the current name is chosen as the + new name. + name : str + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted + upon. + overwrite : bool, optional + If True, the destination group will be overwritten if it already + exists. Defaults to False. + recursive : bool, optional + If True, all descendant nodes of srcgroup are recursively copied. + Defaults to False. + createparents : bool, optional + If True, any necessary parents of dstgroup will be created. + Defaults to False. + kwargs + Additional keyword arguments can be used to customize the copying + process. See the documentation of :meth:`Group._f_copy` + for a description of those arguments. + + Returns + ------- + node : Node + The newly created copy of the source node (i.e. the destination + node). See :meth:`.Node._f_copy` for further details on the + semantics of copying nodes. + + """ + + obj = self.get_node(where, name=name) + if obj._v_depth == 0 and newparent and not newname: + npobj = self.get_node(newparent) + if obj._v_file is not npobj._v_file: + # Special case for copying file1:/ --> file2:/path + self.root._f_copy_children(npobj, overwrite=overwrite, + recursive=recursive, **kwargs) + return npobj + else: + raise OSError( + "You cannot copy a root group over the same file") + return obj._f_copy(newparent, newname, + overwrite, recursive, createparents, **kwargs) + + def remove_node(self, where, name=None, recursive=False): + """Remove the object node *name* under *where* location. + + Parameters + ---------- + where, name + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted upon. + recursive : bool + If not supplied or false, the node will be removed + only if it has no children; if it does, a + NodeError will be raised. If supplied + with a true value, the node and all its descendants will be + completely removed. + + """ + + obj = self.get_node(where, name=name) + obj._f_remove(recursive) + + def get_node_attr(self, where, attrname, name=None): + """Get a PyTables attribute from the given node. + + Parameters + ---------- + where, name + These arguments work as in :meth:`File.get_node`, referencing the + node to be acted upon. + attrname + The name of the attribute to retrieve. If the named + attribute does not exist, an AttributeError is raised. + + """ + + obj = self.get_node(where, name=name) + return obj._f_getattr(attrname) + + def set_node_attr(self, where, attrname, attrvalue, name=None): + """Set a PyTables attribute for the given node. + + Parameters + ---------- + where, name + These arguments work as in + :meth:`File.get_node`, referencing the node to be acted upon. + attrname + The name of the attribute to set. + attrvalue + The value of the attribute to set. Any kind of Python + object (like strings, ints, floats, lists, tuples, dicts, + small NumPy objects ...) can be stored as an attribute. + However, if necessary, pickle is automatically used so as + to serialize objects that you might want to save. + See the :class:`AttributeSet` class for details. + + Notes + ----- + If the node already has a large number of attributes, a + PerformanceWarning is issued. + + """ + + obj = self.get_node(where, name=name) + obj._f_setattr(attrname, attrvalue) + + def del_node_attr(self, where, attrname, name=None): + """Delete a PyTables attribute from the given node. 
+ + Parameters + ---------- + where, name + These arguments work as in :meth:`File.get_node`, referencing the + node to be acted upon. + attrname + The name of the attribute to delete. If the named + attribute does not exist, an AttributeError is raised. + + """ + + obj = self.get_node(where, name=name) + obj._f_delattr(attrname) + + def copy_node_attrs(self, where, dstnode, name=None): + """Copy PyTables attributes from one node to another. + + Parameters + ---------- + where, name + These arguments work as in :meth:`File.get_node`, referencing the + node to be acted upon. + dstnode + The destination node where the attributes will be copied to. It can + be a path string or a Node instance (see :ref:`NodeClassDescr`). + + """ + + srcobject = self.get_node(where, name=name) + dstobject = self.get_node(dstnode) + srcobject._v_attrs._f_copy(dstobject) + + def copy_children(self, srcgroup, dstgroup, + overwrite=False, recursive=False, + createparents=False, **kwargs): + """Copy the children of a group into another group. + + Parameters + ---------- + srcgroup : str + The group to copy from. + dstgroup : str + The destination group. + overwrite : bool, optional + If True, the destination group will be overwritten if it already + exists. Defaults to False. + recursive : bool, optional + If True, all descendant nodes of srcgroup are recursively copied. + Defaults to False. + createparents : bool, optional + If True, any necessary parents of dstgroup will be created. + Defaults to False. + kwargs : dict + Additional keyword arguments can be used to customize the copying + process. See the documentation of :meth:`Group._f_copy_children` + for a description of those arguments. + + """ + + srcgroup = self.get_node(srcgroup) # Does the source node exist? + self._check_group(srcgroup) # Is it a group? + + srcgroup._f_copy_children( + dstgroup, overwrite, recursive, createparents, **kwargs) + + def copy_file(self, dstfilename, overwrite=False, **kwargs): + """Copy the contents of this file to dstfilename. + + Parameters + ---------- + dstfilename : str + A path string indicating the name of the destination file. If + it already exists, the copy will fail with an IOError, unless + the overwrite argument is true. + overwrite : bool, optional + If true, the destination file will be overwritten if it already + exists. In this case, the destination file must be closed, or + errors will occur. Defaults to False. + kwargs + Additional keyword arguments discussed below. + + Notes + ----- + Additional keyword arguments may be passed to customize the + copying process. For instance, title and filters may be changed, + user attributes may be or may not be copied, data may be + sub-sampled, stats may be collected, etc. Arguments unknown to + nodes are simply ignored. Check the documentation for copying + operations of nodes to see which options they support. + + In addition, it recognizes the names of parameters present in + :file:`tables/parameters.py` as additional keyword arguments. + See :ref:`parameter_files` for a detailed info on the supported + parameters. + + Copying a file usually has the beneficial side effect of + creating a more compact and cleaner version of the original + file. + + """ + + self._check_open() + + # Check that we are not treading our own shoes + if Path(self.filename).resolve() == Path(dstfilename).resolve(): + raise OSError("You cannot copy a file over itself") + + # Compute default arguments. + # These are *not* passed on. 
+ filters = kwargs.pop('filters', None) + if filters is None: + # By checking the HDF5 attribute, we avoid setting filters + # in the destination file if not explicitly set in the + # source file. Just by assigning ``self.filters`` we would + # not be able to tell. + filters = getattr(self.root._v_attrs, 'FILTERS', None) + copyuserattrs = kwargs.get('copyuserattrs', True) + title = kwargs.pop('title', self.title) + + if Path(dstfilename).is_file() and not overwrite: + raise OSError( + f"file ``{dstfilename}`` already exists; you may want to " + f"use the ``overwrite`` argument" + ) + + # Create destination file, overwriting it. + dstfileh = open_file( + dstfilename, mode="w", title=title, filters=filters, **kwargs) + + try: + # Maybe copy the user attributes of the root group. + if copyuserattrs: + self.root._v_attrs._f_copy(dstfileh.root) + + # Copy the rest of the hierarchy. + self.root._f_copy_children(dstfileh.root, recursive=True, **kwargs) + finally: + dstfileh.close() + + def list_nodes(self, where, classname=None): + """Return a *list* with children nodes hanging from where. + + This is a list-returning version of :meth:`File.iter_nodes`. + + """ + + group = self.get_node(where) # Does the parent exist? + self._check_group(group) # Is it a group? + + return group._f_list_nodes(classname) + + def iter_nodes(self, where, classname=None): + """Iterate over children nodes hanging from where. + + Parameters + ---------- + where + This argument works as in :meth:`File.get_node`, referencing the + node to be acted upon. + classname + If the name of a class derived from + Node (see :ref:`NodeClassDescr`) is supplied, only instances of + that class (or subclasses of it) will be returned. + + Notes + ----- + The returned nodes are alphanumerically sorted by their name. + This is an iterator version of :meth:`File.list_nodes`. + + """ + + group = self.get_node(where) # Does the parent exist? + self._check_group(group) # Is it a group? + + return group._f_iter_nodes(classname) + + def __contains__(self, path): + """Is there a node with that path? + + Returns True if the file has a node with the given path (a + string), False otherwise. + + """ + + try: + self.get_node(path) + except NoSuchNodeError: + return False + else: + return True + + def __iter__(self): + """Recursively iterate over the nodes in the tree. + + This is equivalent to calling :meth:`File.walk_nodes` with no + arguments. + + Examples + -------- + + :: + + # Recursively list all the nodes in the object tree. + h5file = tables.open_file('vlarray1.h5') + print("All nodes in the object tree:") + for node in h5file: + print(node) + + """ + + return self.walk_nodes('/') + + def walk_nodes(self, where="/", classname=None): + """Recursively iterate over nodes hanging from where. + + Parameters + ---------- + where : str or Group, optional + If supplied, the iteration starts from (and includes) + this group. It can be a path string or a + Group instance (see :ref:`GroupClassDescr`). + classname + If the name of a class derived from + Node (see :ref:`GroupClassDescr`) is supplied, only instances of + that class (or subclasses of it) will be returned. + + Notes + ----- + This version iterates over the leaves in the same group in order + to avoid having a list referencing to them and thus, preventing + the LRU cache to remove them after their use. + + Examples + -------- + + :: + + # Recursively print all the nodes hanging from '/detector'. 
+ print("Nodes hanging from group '/detector':") + for node in h5file.walk_nodes('/detector', classname='EArray'): + print(node) + + """ + + class_ = get_class_by_name(classname) + + if class_ is Group: # only groups + yield from self.walk_groups(where) + elif class_ is Node: # all nodes + yield self.get_node(where) + for group in self.walk_groups(where): + yield from self.iter_nodes(group) + else: # only nodes of the named type + for group in self.walk_groups(where): + yield from self.iter_nodes(group, classname) + + def walk_groups(self, where="/"): + """Recursively iterate over groups (not leaves) hanging from where. + + The where group itself is listed first (preorder), then each of its + child groups (following an alphanumerical order) is also traversed, + following the same procedure. If where is not supplied, the root + group is used. + + The where argument can be a path string + or a Group instance (see :ref:`GroupClassDescr`). + + """ + + group = self.get_node(where) # Does the parent exist? + self._check_group(group) # Is it a group? + return group._f_walk_groups() + + def _check_open(self): + """Check the state of the file. + + If the file is closed, a `ClosedFileError` is raised. + + """ + + if not self.isopen: + raise ClosedFileError("the file object is closed") + + def _iswritable(self): + """Is this file writable?""" + + return self.mode in ('w', 'a', 'r+') + + def _check_writable(self): + """Check whether the file is writable. + + If the file is not writable, a `FileModeError` is raised. + + """ + + if not self._iswritable(): + raise FileModeError("the file is not writable") + + def _check_group(self, node): + # `node` must already be a node. + if not isinstance(node, Group): + raise TypeError(f"node ``{node._v_pathname}`` is not a group") + + def is_undo_enabled(self): + """Is the Undo/Redo mechanism enabled? + + Returns True if the Undo/Redo mechanism has been enabled for + this file, False otherwise. Please note that this mechanism is + persistent, so a newly opened PyTables file may already have + Undo/Redo support enabled. + + """ + + self._check_open() + return self._undoEnabled + + def _check_undo_enabled(self): + if not self._undoEnabled: + raise UndoRedoError("Undo/Redo feature is currently disabled!") + + def _create_transaction_group(self): + tgroup = TransactionGroupG( + self.root, _trans_group_name, + "Transaction information container", new=True) + # The format of the transaction container. + tgroup._v_attrs._g__setattr('FORMATVERSION', _trans_version) + return tgroup + + def _create_transaction(self, troot, tid): + return TransactionG( + troot, _trans_name % tid, + "Transaction number %d" % tid, new=True) + + def _create_mark(self, trans, mid): + return MarkG( + trans, _markName % mid, + "Mark number %d" % mid, new=True) + + def enable_undo(self, filters=Filters(complevel=1)): + """Enable the Undo/Redo mechanism. + + This operation prepares the database for undoing and redoing + modifications in the node hierarchy. This + allows :meth:`File.mark`, :meth:`File.undo`, :meth:`File.redo` and + other methods to be called. + + The filters argument, when specified, + must be an instance of class Filters (see :ref:`FiltersClassDescr`) and + is meant for setting the compression values for the action log. The + default is having compression enabled, as the gains in terms of + space can be considerable. You may want to disable compression if + you want maximum speed for Undo/Redo operations. 
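+
+        A minimal sketch of the intended workflow (``f`` is assumed to be
+        an open, writable file handle; the group name is only
+        illustrative)::
+
+            f.enable_undo()
+            f.create_group('/', 'scratch')
+            f.undo()    # '/scratch' disappears again
+            f.redo()    # ... and is recreated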
+ + Calling this method when the Undo/Redo mechanism is already + enabled raises an UndoRedoError. + + """ + + maxundo = self.params['MAX_UNDO_PATH_LENGTH'] + + class ActionLog(NotLoggedMixin, Table): + pass + + class ActionLogDesc(IsDescription): + opcode = UInt8Col(pos=0) + arg1 = StringCol(maxundo, pos=1, dflt=b"") + arg2 = StringCol(maxundo, pos=2, dflt=b"") + + self._check_open() + + # Enabling several times is not allowed to avoid the user having + # the illusion that a new implicit mark has been created + # when calling enable_undo for the second time. + + if self.is_undo_enabled(): + raise UndoRedoError("Undo/Redo feature is already enabled!") + + self._markers = {} + self._seqmarkers = [] + self._nmarks = 0 + self._curtransaction = 0 + self._curmark = -1 # No marks yet + + # Get the Group for keeping user actions + try: + tgroup = self.get_node(_trans_group_path) + except NodeError: + # The file is going to be changed. + self._check_writable() + + # A transaction log group does not exist. Create it + tgroup = self._create_transaction_group() + + # Create a transaction. + self._trans = self._create_transaction( + tgroup, self._curtransaction) + + # Create an action log + self._actionlog = ActionLog( + tgroup, _action_log_name, ActionLogDesc, "Action log", + filters=filters) + + # Create an implicit mark + self._actionlog.append([(_op_to_code["MARK"], str(0), '')]) + self._nmarks += 1 + self._seqmarkers.append(0) # current action is 0 + + # Create a group for mark 0 + self._create_mark(self._trans, 0) + # Initialize the marker pointer + self._curmark = int(self._nmarks - 1) + # Initialize the action pointer + self._curaction = self._actionlog.nrows - 1 + else: + # The group seems to exist already + # Get the default transaction + self._trans = tgroup._f_get_child( + _trans_name % self._curtransaction) + # Open the action log and go to the end of it + self._actionlog = tgroup.actionlog + for row in self._actionlog: + if row["opcode"] == _op_to_code["MARK"]: + name = row["arg2"].decode('utf-8') + self._markers[name] = self._nmarks + self._seqmarkers.append(row.nrow) + self._nmarks += 1 + # Get the current mark and current action + self._curmark = int(self._actionlog.attrs.CURMARK) + self._curaction = self._actionlog.attrs.CURACTION + + # The Undo/Redo mechanism has been enabled. + self._undoEnabled = True + + def disable_undo(self): + """Disable the Undo/Redo mechanism. + + Disabling the Undo/Redo mechanism leaves the database in the + current state and forgets past and future database states. This + makes :meth:`File.mark`, :meth:`File.undo`, :meth:`File.redo` and other + methods fail with an UndoRedoError. + + Calling this method when the Undo/Redo mechanism is already + disabled raises an UndoRedoError. + + """ + + self._check_open() + + if not self.is_undo_enabled(): + raise UndoRedoError("Undo/Redo feature is already disabled!") + + # The file is going to be changed. + self._check_writable() + + del self._markers + del self._seqmarkers + del self._curmark + del self._curaction + del self._curtransaction + del self._nmarks + del self._actionlog + # Recursively delete the transaction group + tnode = self.get_node(_trans_group_path) + tnode._g_remove(recursive=1) + + # The Undo/Redo mechanism has been disabled. + self._undoEnabled = False + + def mark(self, name=None): + """Mark the state of the database. + + Creates a mark for the current state of the database. A unique (and + immutable) identifier for the mark is returned. An optional name (a + string) can be assigned to the mark. 
Both the identifier of a mark and + its name can be used in :meth:`File.undo` and :meth:`File.redo` + operations. When the name has already been used for another mark, + an UndoRedoError is raised. + + This method can only be called when the Undo/Redo mechanism has been + enabled. Otherwise, an UndoRedoError is raised. + + """ + + self._check_open() + self._check_undo_enabled() + + if name is None: + name = '' + else: + if not isinstance(name, str): + raise TypeError("Only strings are allowed as mark names. " + "You passed object: '%s'" % name) + if name in self._markers: + raise UndoRedoError("Name '%s' is already used as a marker " + "name. Try another one." % name) + + # The file is going to be changed. + self._check_writable() + + self._markers[name] = self._curmark + 1 + + # Create an explicit mark + # Insert the mark in the action log + self._log("MARK", str(self._curmark + 1), name) + self._curmark += 1 + self._nmarks = self._curmark + 1 + self._seqmarkers.append(self._curaction) + # Create a group for the current mark + self._create_mark(self._trans, self._curmark) + return self._curmark + + def _log(self, action, *args): + """Log an action. + + The `action` must be an all-uppercase string identifying it. + Arguments must also be strings. + + This method should be called once the action has been completed. + + This method can only be called when the Undo/Redo mechanism has + been enabled. Otherwise, an `UndoRedoError` is raised. + + """ + + assert self.is_undo_enabled() + + maxundo = self.params['MAX_UNDO_PATH_LENGTH'] + # Check whether we are at the end of the action log or not + if self._curaction != self._actionlog.nrows - 1: + # We are not, so delete the trailing actions + self._actionlog.remove_rows(self._curaction + 1, + self._actionlog.nrows) + # Reset the current marker group + mnode = self.get_node(_markPath % (self._curtransaction, + self._curmark)) + mnode._g_reset() + # Delete the marker groups with backup objects + for mark in range(self._curmark + 1, self._nmarks): + mnode = self.get_node(_markPath % (self._curtransaction, mark)) + mnode._g_remove(recursive=1) + # Update the new number of marks + self._nmarks = self._curmark + 1 + self._seqmarkers = self._seqmarkers[:self._nmarks] + + if action not in _op_to_code: # INTERNAL + raise UndoRedoError("Action ``%s`` not in ``_op_to_code`` " + "dictionary: %r" % (action, _op_to_code)) + + arg1 = "" + arg2 = "" + if len(args) <= 1: + arg1 = args[0] + elif len(args) <= 2: + arg1 = args[0] + arg2 = args[1] + else: # INTERNAL + raise UndoRedoError("Too many parameters for action log: " + "%r").with_traceback(args) + if (len(arg1) > maxundo + or len(arg2) > maxundo): # INTERNAL + raise UndoRedoError("Parameter arg1 or arg2 is too long: " + "(%r, %r)" % (arg1, arg2)) + # print("Logging-->", (action, arg1, arg2)) + self._actionlog.append([(_op_to_code[action], + arg1.encode('utf-8'), + arg2.encode('utf-8'))]) + self._curaction += 1 + + def _get_mark_id(self, mark): + """Get an integer markid from a mark sequence number or name.""" + + if isinstance(mark, int): + markid = mark + elif isinstance(mark, str): + if mark not in self._markers: + lmarkers = sorted(self._markers) + raise UndoRedoError("The mark that you have specified has not " + "been found in the internal marker list: " + "%r" % lmarkers) + markid = self._markers[mark] + else: + raise TypeError("Parameter mark can only be an integer or a " + "string, and you passed a type <%s>" % type(mark)) + # print("markid, self._nmarks:", markid, self._nmarks) + return markid + + def 
_get_final_action(self, markid): + """Get the action to go. + + It does not touch the self private attributes + + """ + + if markid > self._nmarks - 1: + # The required mark is beyond the end of the action log + # The final action is the last row + return self._actionlog.nrows + elif markid <= 0: + # The required mark is the first one + # return the first row + return 0 + + return self._seqmarkers[markid] + + def _doundo(self, finalaction, direction): + """Undo/Redo actions up to final action in the specificed direction.""" + + if direction < 0: + actionlog = \ + self._actionlog[finalaction + 1:self._curaction + 1][::-1] + else: + actionlog = self._actionlog[self._curaction:finalaction] + + # Uncomment this for debugging +# print("curaction, finalaction, direction", \ +# self._curaction, finalaction, direction) + for i in range(len(actionlog)): + if actionlog['opcode'][i] != _op_to_code["MARK"]: + # undo/redo the action + if direction > 0: + # Uncomment this for debugging + # print("redo-->", \ + # _code_to_op[actionlog['opcode'][i]],\ + # actionlog['arg1'][i],\ + # actionlog['arg2'][i]) + undoredo.redo(self, + # _code_to_op[actionlog['opcode'][i]], + # The next is a workaround for python < 2.5 + _code_to_op[int(actionlog['opcode'][i])], + actionlog['arg1'][i].decode('utf8'), + actionlog['arg2'][i].decode('utf8')) + else: + # Uncomment this for debugging + # print("undo-->", \ + # _code_to_op[actionlog['opcode'][i]],\ + # actionlog['arg1'][i].decode('utf8'),\ + # actionlog['arg2'][i].decode('utf8')) + undoredo.undo(self, + # _code_to_op[actionlog['opcode'][i]], + # The next is a workaround for python < 2.5 + _code_to_op[int(actionlog['opcode'][i])], + actionlog['arg1'][i].decode('utf8'), + actionlog['arg2'][i].decode('utf8')) + else: + if direction > 0: + self._curmark = int(actionlog['arg1'][i]) + else: + self._curmark = int(actionlog['arg1'][i]) - 1 + # Protection against negative marks + if self._curmark < 0: + self._curmark = 0 + self._curaction += direction + + def undo(self, mark=None): + """Go to a past state of the database. + + Returns the database to the state associated with the specified mark. + Both the identifier of a mark and its name can be used. If the mark is + omitted, the last created mark is used. If there are no past + marks, or the specified mark is not older than the current one, an + UndoRedoError is raised. + + This method can only be called when the Undo/Redo mechanism + has been enabled. Otherwise, an UndoRedoError + is raised. + + """ + + self._check_open() + self._check_undo_enabled() + +# print("(pre)UNDO: (curaction, curmark) = (%s,%s)" % \ +# (self._curaction, self._curmark)) + if mark is None: + markid = self._curmark + # Correction if we are settled on top of a mark + opcode = self._actionlog.cols.opcode + if opcode[self._curaction] == _op_to_code["MARK"]: + markid -= 1 + else: + # Get the mark ID number + markid = self._get_mark_id(mark) + # Get the final action ID to go + finalaction = self._get_final_action(markid) + if finalaction > self._curaction: + raise UndoRedoError("Mark ``%s`` is newer than the current mark. " + "Use `redo()` or `goto()` instead." % (mark,)) + + # The file is going to be changed. 
+ self._check_writable() + + # Try to reach this mark by unwinding actions in the log + self._doundo(finalaction - 1, -1) + if self._curaction < self._actionlog.nrows - 1: + self._curaction += 1 + self._curmark = int(self._actionlog.cols.arg1[self._curaction]) +# print("(post)UNDO: (curaction, curmark) = (%s,%s)" % \ +# (self._curaction, self._curmark)) + + def redo(self, mark=None): + """Go to a future state of the database. + + Returns the database to the state associated with the specified + mark. Both the identifier of a mark and its name can be used. + If the `mark` is omitted, the next created mark is used. If + there are no future marks, or the specified mark is not newer + than the current one, an UndoRedoError is raised. + + This method can only be called when the Undo/Redo mechanism has + been enabled. Otherwise, an UndoRedoError is raised. + + """ + + self._check_open() + self._check_undo_enabled() + +# print("(pre)REDO: (curaction, curmark) = (%s, %s)" % \ +# (self._curaction, self._curmark)) + if self._curaction >= self._actionlog.nrows - 1: + # We are at the end of log, so no action + return + + if mark is None: + mark = self._curmark + 1 + elif mark == -1: + mark = int(self._nmarks) # Go beyond the mark bounds up to the end + # Get the mark ID number + markid = self._get_mark_id(mark) + finalaction = self._get_final_action(markid) + if finalaction < self._curaction + 1: + raise UndoRedoError("Mark ``%s`` is older than the current mark. " + "Use `redo()` or `goto()` instead." % (mark,)) + + # The file is going to be changed. + self._check_writable() + + # Get the final action ID to go + self._curaction += 1 + + # Try to reach this mark by redoing the actions in the log + self._doundo(finalaction, 1) + # Increment the current mark only if we are not at the end of marks + if self._curmark < self._nmarks - 1: + self._curmark += 1 + if self._curaction > self._actionlog.nrows - 1: + self._curaction = self._actionlog.nrows - 1 +# print("(post)REDO: (curaction, curmark) = (%s,%s)" % \ +# (self._curaction, self._curmark)) + + def goto(self, mark): + """Go to a specific mark of the database. + + Returns the database to the state associated with the specified mark. + Both the identifier of a mark and its name can be used. + + This method can only be called when the Undo/Redo mechanism has been + enabled. Otherwise, an UndoRedoError is raised. + + """ + + self._check_open() + self._check_undo_enabled() + + if mark == -1: # Special case + mark = self._nmarks # Go beyond the mark bounds up to the end + # Get the mark ID number + markid = self._get_mark_id(mark) + finalaction = self._get_final_action(markid) + if finalaction < self._curaction: + self.undo(mark) + else: + self.redo(mark) + + def get_current_mark(self): + """Get the identifier of the current mark. + + Returns the identifier of the current mark. This can be used + to know the state of a database after an application crash, or to + get the identifier of the initial implicit mark after a call + to :meth:`File.enable_undo`. + + This method can only be called when the Undo/Redo mechanism + has been enabled. Otherwise, an UndoRedoError + is raised. + + """ + + self._check_open() + self._check_undo_enabled() + return self._curmark + + def _shadow_name(self): + """Compute and return a shadow name. + + Computes the current shadow name according to the current + transaction, mark and action. It returns a tuple with the + shadow parent node and the name of the shadow in it. 
+ + """ + + parent = self.get_node( + _shadow_parent % (self._curtransaction, self._curmark)) + name = _shadow_name % (self._curaction,) + + return (parent, name) + + def flush(self): + """Flush all the alive leaves in the object tree.""" + + self._check_open() + + # Flush the cache to disk + self._node_manager.flush_nodes() + self._flush_file(0) # 0 means local scope, 1 global (virtual) scope + + def close(self): + """Flush all the alive leaves in object tree and close the file.""" + + # If the file is already closed, return immediately + if not self.isopen: + return + + # If this file has been opened more than once, decrease the + # counter and return + if self._open_count > 1: + self._open_count -= 1 + return + + filename = self.filename + + if self._undoEnabled and self._iswritable(): + # Save the current mark and current action + self._actionlog.attrs._g__setattr("CURMARK", self._curmark) + self._actionlog.attrs._g__setattr("CURACTION", self._curaction) + + # Close all loaded nodes. + self.root._f_close() + + self._node_manager.shutdown() + + # Post-conditions + assert len(self._node_manager.cache) == 0, \ + ("cached nodes remain after closing: %s" + % list(self._node_manager.cache)) + + # No other nodes should have been revived. + assert len(self._node_manager.registry) == 0, \ + ("alive nodes remain after closing: %s" + % list(self._node_manager.registry)) + + # Close the file + self._close_file() + + # After the objects are disconnected, destroy the + # object dictionary using the brute force ;-) + # This should help to the garbage collector + self.__dict__.clear() + + # Set the flag to indicate that the file is closed + self.isopen = 0 + + # Restore the filename attribute that is used by _FileRegistry + self.filename = filename + + # Delete the entry from he registry of opened files + _open_files.remove(self) + + def __enter__(self): + """Enter a context and return the same file.""" + + return self + + def __exit__(self, *exc_info): + """Exit a context and close the file.""" + + self.close() + return False # do not hide exceptions + + def __str__(self): + """Return a short string representation of the object tree. + + Examples + -------- + + :: + + >>> import tables + >>> f = tables.open_file('tables/tests/Tables_lzo2.h5') + >>> print(f) + tables/tests/Tables_lzo2.h5 (File) 'Table Benchmark' + Last modif.: '...' 
+ Object Tree: + / (RootGroup) 'Table Benchmark' + /tuple0 (Table(100,)lzo(1)) 'This is the table title' + /group0 (Group) '' + /group0/tuple1 (Table(100,)lzo(1)) 'This is the table title' + /group0/group1 (Group) '' + /group0/group1/tuple2 (Table(100,)lzo(1)) 'This is the table title' + /group0/group1/group2 (Group) '' + >>> f.close() + + """ + if not self.isopen: + return "" + + # Print all the nodes (Group and Leaf objects) on object tree + try: + date = datetime.datetime.fromtimestamp( + Path(self.filename).stat().st_mtime, datetime.timezone.utc + ).isoformat(timespec='seconds') + except OSError: + # in-memory file + date = "" + lines = [f'{self.filename} (File) {self.title!r}', + f'Last modif.: {date!r}', + 'Object Tree: '] + + for group in self.walk_groups("/"): + lines.append(f'{group}') + for kind in self._node_kinds[1:]: + for node in self.list_nodes(group, kind): + lines.append(f'{node}') + return '\n'.join(lines) + '\n' + + def __repr__(self): + """Return a detailed string representation of the object tree.""" + + if not self.isopen: + return "" + + # Print all the nodes (Group and Leaf objects) on object tree + lines = [ + f'File(filename={self.filename!s}, title={self.title!r}, ' + f'mode={self.mode!r}, root_uep={self.root_uep!r}, ' + f'filters={self.filters!r})'] + for group in self.walk_groups("/"): + lines.append(f'{group}') + for kind in self._node_kinds[1:]: + for node in self.list_nodes(group, kind): + lines.append(f'{node!r}') + return '\n'.join(lines) + '\n' + + def _update_node_locations(self, oldpath, newpath): + """Update location information of nodes under `oldpath`. + + This only affects *already loaded* nodes. + + """ + + oldprefix = oldpath + '/' # root node can not be renamed, anyway + oldprefix_len = len(oldprefix) + + # Update alive and dead descendents. + for cache in [self._node_manager.cache, self._node_manager.registry]: + for nodepath in list(cache): + if nodepath.startswith(oldprefix) and nodepath != oldprefix: + nodesuffix = nodepath[oldprefix_len:] + newnodepath = join_path(newpath, nodesuffix) + newnodeppath = split_path(newnodepath)[0] + descendent_node = self._get_node(nodepath) + descendent_node._g_update_location(newnodeppath) + + +# If a user hits ^C during a run, it is wise to gracefully close the +# opened files. +atexit.register(_open_files.close_all) diff --git a/tables/filters.py b/tables/filters.py new file mode 100644 index 0000000..190f760 --- /dev/null +++ b/tables/filters.py @@ -0,0 +1,437 @@ +"""Functionality related with filters in a PyTables file.""" + +import warnings +import numpy as np + +from . import utilsextension, blosc_compressor_list, blosc_compcode_to_compname +from .exceptions import FiltersWarning +from packaging.version import Version + +import tables as tb + +blosc_version = Version(tb.which_lib_version("blosc")[1]) + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + +all_complibs = ['zlib', 'lzo', 'bzip2', 'blosc'] +all_complibs += ['blosc:%s' % cname for cname in blosc_compressor_list()] + + +"""List of all compression libraries.""" + +foreign_complibs = ['szip'] +"""List of known but unsupported compression libraries.""" + +default_complib = 'zlib' +"""The default compression library.""" + + +_shuffle_flag = 0x1 +_fletcher32_flag = 0x2 +_rounding_flag = 0x4 +_bitshuffle_flag = 0x8 + + +class Filters: + """Container for filter properties. 
+ + This class is meant to serve as a container that keeps information about + the filter properties associated with the chunked leaves, that is Table, + CArray, EArray and VLArray. + + Instances of this class can be directly compared for equality. + + Parameters + ---------- + complevel : int + Specifies a compression level for data. The allowed + range is 0-9. A value of 0 (the default) disables + compression. + complib : str + Specifies the compression library to be used. Right now, 'zlib' (the + default), 'lzo', 'bzip2' and 'blosc' are supported. Additional + compressors for Blosc like 'blosc:blosclz' ('blosclz' is the default in + case the additional compressor is not specified), 'blosc:lz4', + 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib' and 'blosc:zstd' are + supported too. Specifying a compression library which is not available + in the system issues a FiltersWarning and sets the library to the + default one. + shuffle : bool + Whether or not to use the *Shuffle* filter in the HDF5 + library. This is normally used to improve the compression + ratio. A false value disables shuffling and a true one enables + it. The default value depends on whether compression is + enabled or not; if compression is enabled, shuffling defaults + to be enabled, else shuffling is disabled. Shuffling can only + be used when compression is enabled. + bitshuffle : bool + Whether or not to use the *BitShuffle* filter in the Blosc + library. This is normally used to improve the compression + ratio. A false value disables bitshuffling and a true one + enables it. The default value is disabled. + fletcher32 : bool + Whether or not to use the + *Fletcher32* filter in the HDF5 library. + This is used to add a checksum on each data chunk. A false + value (the default) disables the checksum. + least_significant_digit : int + If specified, data will be truncated (quantized). In conjunction + with enabling compression, this produces 'lossy', but + significantly more efficient compression. For example, if + *least_significant_digit=1*, data will be quantized using + ``around(scale*data)/scale``, where ``scale = 2**bits``, and + bits is determined so that a precision of 0.1 is retained (in + this case bits=4). Default is *None*, or no quantization. + + .. note:: + + quantization is only applied if some form of compression is + enabled + + Examples + -------- + + This is a small example on using the Filters class:: + + import numpy as np + import tables as tb + + fileh = tb.open_file('test5.h5', mode='w') + atom = Float32Atom() + filters = Filters(complevel=1, complib='blosc', fletcher32=True) + arr = fileh.create_earray(fileh.root, 'earray', atom, (0,2), + "A growable array", filters=filters) + + # Append several rows in only one call + arr.append(np.array([[1., 2.], + [2., 3.], + [3., 4.]], dtype=np.float32)) + + # Print information on that enlargeable array + print("Result Array:") + print(repr(arr)) + fileh.close() + + This enforces the use of the Blosc library, a compression level of 1 and a + Fletcher32 checksum filter as well. See the output of this example:: + + Result Array: + /earray (EArray(3, 2), fletcher32, shuffle, blosc(1)) 'A growable array' + type = float32 + shape = (3, 2) + itemsize = 4 + nrows = 3 + extdim = 0 + flavor = 'numpy' + byteorder = 'little' + + .. rubric:: Filters attributes + + .. attribute:: fletcher32 + + Whether the *Fletcher32* filter is active or not. + + .. attribute:: complevel + + The compression level (0 disables compression). + + .. 
attribute:: complib + + The compression filter used (irrelevant when compression is not + enabled). + + .. attribute:: shuffle + + Whether the *Shuffle* filter is active or not. + + .. attribute:: bitshuffle + + Whether the *BitShuffle* filter is active or not (Blosc only). + + """ + + @property + def shuffle_bitshuffle(self): + """Encode NoShuffle (0), Shuffle (1) and BitShuffle (2) filters.""" + if (self.shuffle and self.bitshuffle): + raise ValueError( + "Shuffle and BitShuffle cannot be active at the same time") + if not (self.shuffle or self.bitshuffle): + return 0 + if self.shuffle: + return 1 + if self.bitshuffle: + return 2 + + @classmethod + def _from_leaf(cls, leaf): + # Get a dictionary with all the filters + parent = leaf._v_parent + filters_dict = utilsextension.get_filters(parent._v_objectid, + leaf._v_name) + if filters_dict is None: + filters_dict = {} # not chunked + + # Keyword arguments are all off + kwargs = dict(complevel=0, shuffle=False, bitshuffle=False, + fletcher32=False, least_significant_digit=None, + _new=False) + for (name, values) in filters_dict.items(): + if name == 'deflate': + name = 'zlib' + if name in all_complibs: + kwargs['complib'] = name + if name == "blosc": + kwargs['complevel'] = values[4] + if values[5] == 1: + # Shuffle filter is internal to blosc + kwargs['shuffle'] = True + elif values[5] == 2: + # Shuffle filter is internal to blosc + kwargs['bitshuffle'] = True + # From Blosc 1.3 on, parameter 6 is used for the compressor + if len(values) > 6: + cname = blosc_compcode_to_compname(values[6]) + kwargs['complib'] = "blosc:%s" % cname + else: + kwargs['complevel'] = values[0] + elif name in foreign_complibs: + kwargs['complib'] = name + kwargs['complevel'] = 1 # any nonzero value will do + elif name in ['shuffle', 'fletcher32']: + kwargs[name] = True + return cls(**kwargs) + + @classmethod + def _unpack(cls, packed): + """Create a new `Filters` object from a packed version. + + >>> Filters._unpack(0) + Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None) + >>> Filters._unpack(0x101) + Filters(complevel=1, complib='zlib', shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None) + >>> Filters._unpack(0x30109) + Filters(complevel=9, complib='zlib', shuffle=True, bitshuffle=False, fletcher32=True, least_significant_digit=None) + >>> Filters._unpack(0x3010A) + Traceback (most recent call last): + ... + ValueError: compression level must be between 0 and 9 + >>> Filters._unpack(0x1) + Traceback (most recent call last): + ... + ValueError: invalid compression library id: 0 + + """ + + kwargs = {'_new': False} + + # Byte 0: compression level. + kwargs['complevel'] = complevel = packed & 0xff + packed >>= 8 + + # Byte 1: compression library id (0 for none). + if complevel > 0: + complib_id = int(packed & 0xff) + if not (0 < complib_id <= len(all_complibs)): + raise ValueError("invalid compression library id: %d" + % complib_id) + kwargs['complib'] = all_complibs[complib_id - 1] + packed >>= 8 + + # Byte 2: parameterless filters. + kwargs['shuffle'] = packed & _shuffle_flag + kwargs['bitshuffle'] = packed & _bitshuffle_flag + kwargs['fletcher32'] = packed & _fletcher32_flag + has_rounding = packed & _rounding_flag + packed >>= 8 + + # Byte 3: least significant digit. 
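+        # Overall layout (see ``_pack``):
+        #   lsd << 24 | filter_flags << 16 | complib_id << 8 | complevel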
+ if has_rounding: + kwargs['least_significant_digit'] = np.int8(packed & 0xff) + else: + kwargs['least_significant_digit'] = None + + return cls(**kwargs) + + def _pack(self): + """Pack the `Filters` object into a 64-bit NumPy integer.""" + + packed = np.int64(0) + + # Byte 3: least significant digit. + if self.least_significant_digit is not None: + # assert isinstance(self.least_significant_digit, numpy.int8) + packed |= self.least_significant_digit + packed <<= 8 + + # Byte 2: parameterless filters. + if self.shuffle: + packed |= _shuffle_flag + if self.bitshuffle: + packed |= _bitshuffle_flag + if self.fletcher32: + packed |= _fletcher32_flag + if self.least_significant_digit: + packed |= _rounding_flag + packed <<= 8 + + # Byte 1: compression library id (0 for none). + if self.complevel > 0: + packed |= all_complibs.index(self.complib) + 1 + packed <<= 8 + + # Byte 0: compression level. + packed |= self.complevel + + return packed + + def __init__(self, complevel=0, complib=default_complib, + shuffle=True, bitshuffle=False, fletcher32=False, + least_significant_digit=None, _new=True): + + if not (0 <= complevel <= 9): + raise ValueError("compression level must be between 0 and 9") + + if _new and complevel > 0: + # These checks are not performed when loading filters from disk. + if complib not in all_complibs: + raise ValueError( + "compression library ``%s`` is not supported; " + "it must be one of: %s" + % (complib, ", ".join(all_complibs))) + if utilsextension.which_lib_version(complib) is None: + warnings.warn("compression library ``%s`` is not available; " + "using ``%s`` instead" + % (complib, default_complib), FiltersWarning) + complib = default_complib # always available + + complevel = int(complevel) + complib = str(complib) + shuffle = bool(shuffle) + bitshuffle = bool(bitshuffle) + fletcher32 = bool(fletcher32) + if least_significant_digit is not None: + least_significant_digit = np.int8(least_significant_digit) + + if complevel == 0: + # Override some inputs when compression is not enabled. + complib = None # make it clear there is no compression + shuffle = False # shuffling and not compressing makes no sense + least_significant_digit = None + elif complib not in all_complibs: + # Do not try to use a meaningful level for unsupported libs. + complevel = -1 + + self.complevel = complevel + """The compression level (0 disables compression).""" + + self.complib = complib + """The compression filter used (irrelevant when compression is + not enabled). + """ + + self.shuffle = shuffle + """Whether the *Shuffle* filter is active or not.""" + + self.bitshuffle = bitshuffle + """Whether the *BitShuffle* filter is active or not.""" + + if (self.complib and + self.bitshuffle and + not self.complib.startswith('blosc')): + raise ValueError("BitShuffle can only be used inside Blosc") + + if self.shuffle and self.bitshuffle: + # BitShuffle has priority in case both are specified + self.shuffle = False + + if (self.bitshuffle and + blosc_version < tb.req_versions.min_blosc_bitshuffle_version): + raise ValueError(f"This Blosc library does not have support for " + f"the bitshuffle filter. 
Please update to " + f"Blosc >= " + f"{tb.req_versions.min_blosc_bitshuffle_version}") + + self.fletcher32 = fletcher32 + """Whether the *Fletcher32* filter is active or not.""" + + self.least_significant_digit = least_significant_digit + """The least significant digit to which data shall be truncated.""" + + def __repr__(self): + args = [] + if self.complevel >= 0: # meaningful compression level + args.append(f'complevel={self.complevel}') + if self.complevel != 0: # compression enabled (-1 or > 0) + args.append(f'complib={self.complib!r}') + args.append(f'shuffle={self.shuffle}') + args.append(f'bitshuffle={self.bitshuffle}') + args.append(f'fletcher32={self.fletcher32}') + args.append(f'least_significant_digit={self.least_significant_digit}') + return f'{self.__class__.__name__}({", ".join(args)})' + + def __str__(self): + return repr(self) + + def __eq__(self, other): + if not isinstance(other, self.__class__): + return False + for attr in self.__dict__: + if getattr(self, attr) != getattr(other, attr): + return False + return True + + # XXX: API incompatible change for PyTables 3 line + # Overriding __eq__ blocks inheritance of __hash__ in 3.x + # def __hash__(self): + # return hash((self.__class__, self.complevel, self.complib, + # self.shuffle, self.bitshuffle, self.fletcher32)) + + def copy(self, **override): + """Get a copy of the filters, possibly overriding some arguments. + + Constructor arguments to be overridden must be passed as keyword + arguments. + + Using this method is recommended over replacing the attributes of an + instance, since instances of this class may become immutable in the + future:: + + >>> filters1 = Filters() + >>> filters2 = filters1.copy() + >>> filters1 == filters2 + True + >>> filters1 is filters2 + False + >>> filters3 = filters1.copy(complevel=1) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + ValueError: compression library ``None`` is not supported... + >>> filters3 = filters1.copy(complevel=1, complib='zlib') + >>> print(filters1) + Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None) + >>> print(filters3) + Filters(complevel=1, complib='zlib', shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None) + >>> filters1.copy(foobar=42) #doctest: +ELLIPSIS + Traceback (most recent call last): + ... + TypeError: ...__init__() got an unexpected keyword argument 'foobar' + + """ + + newargs = self.__dict__.copy() + newargs.update(override) + return self.__class__(**newargs) + + +def _test(): + """Run ``doctest`` on this module.""" + + import doctest + doctest.testmod() + + +if __name__ == '__main__': + _test() diff --git a/tables/flavor.py b/tables/flavor.py new file mode 100644 index 0000000..41bbbc8 --- /dev/null +++ b/tables/flavor.py @@ -0,0 +1,428 @@ +"""Utilities for handling different array flavors in PyTables. + +Variables +========= + +`__docformat`__ + The format of documentation strings in this module. +`internal_flavor` + The flavor used internally by PyTables. +`all_flavors` + List of all flavors available to PyTables. +`alias_map` + Maps old flavor names to the most similar current flavor. +`description_map` + Maps flavors to short descriptions of their supported objects. +`identifier_map` + Maps flavors to functions that can identify their objects. + + The function associated with a given flavor will return a true + value if the object passed to it can be identified as being of + that flavor. 
+ + See the `flavor_of()` function for a friendlier interface to + flavor identification. + +`converter_map` + Maps (source, destination) flavor pairs to converter functions. + + Converter functions get an array of the source flavor and return + an array of the destination flavor. + + See the `array_of_flavor()` and `flavor_to_flavor()` functions for + friendlier interfaces to flavor conversion. + +""" + +import warnings + +import numpy as np + +from .exceptions import FlavorError, FlavorWarning + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + +internal_flavor = 'numpy' +"""The flavor used internally by PyTables.""" + +# This is very slightly slower than a set for a small number of values +# in terms of (infrequent) lookup time, but allows `flavor_of()` +# (which may be called much more frequently) to check for flavors in +# order, beginning with the most common one. +all_flavors = [] # filled as flavors are registered +"""List of all flavors available to PyTables.""" + +alias_map = {} # filled as flavors are registered +"""Maps old flavor names to the most similar current flavor.""" + +description_map = {} # filled as flavors are registered +"""Maps flavors to short descriptions of their supported objects.""" + +identifier_map = {} # filled as flavors are registered +"""Maps flavors to functions that can identify their objects. + +The function associated with a given flavor will return a true value +if the object passed to it can be identified as being of that flavor. + +See the `flavor_of()` function for a friendlier interface to flavor +identification. +""" + +converter_map = {} # filled as flavors are registered +"""Maps (source, destination) flavor pairs to converter functions. + +Converter functions get an array of the source flavor and return an +array of the destination flavor. + +See the `array_of_flavor()` and `flavor_to_flavor()` functions for +friendlier interfaces to flavor conversion. +""" + + +def check_flavor(flavor): + """Raise a ``FlavorError`` if the `flavor` is not valid.""" + + if flavor not in all_flavors: + available_flavs = ", ".join(flav for flav in all_flavors) + raise FlavorError( + "flavor ``%s`` is unsupported or unavailable; " + "available flavors in this system are: %s" + % (flavor, available_flavs)) + + +def array_of_flavor2(array, src_flavor, dst_flavor): + """Get a version of the given `array` in a different flavor. + + The input `array` must be of the given `src_flavor`, and the + returned array will be of the indicated `dst_flavor`. Both + flavors may be the same, but it is not guaranteed that the + returned array will be the same object as the input one in this + case. + + If the conversion is not supported, a ``FlavorError`` is raised. + + """ + + convkey = (src_flavor, dst_flavor) + if convkey not in converter_map: + raise FlavorError("conversion from flavor ``%s`` to flavor ``%s`` " + "is unsupported or unavailable in this system" + % (src_flavor, dst_flavor)) + + convfunc = converter_map[convkey] + return convfunc(array) + + +def flavor_to_flavor(array, src_flavor, dst_flavor): + """Get a version of the given `array` in a different flavor. + + The input `array` must be of the given `src_flavor`, and the + returned array will be of the indicated `dst_flavor` (see below + for an exception to this). Both flavors may be the same, but it + is not guaranteed that the returned array will be the same object + as the input one in this case. 
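+
+        For instance, with the converters registered at the bottom of this
+        module, a NumPy array can be converted into a plain Python list::
+
+            flavor_to_flavor(np.array([1, 2, 3]), 'numpy', 'python')
+            # -> [1, 2, 3]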
+ + If the conversion is not supported, a `FlavorWarning` is issued + and the input `array` is returned as is. + + """ + + try: + return array_of_flavor2(array, src_flavor, dst_flavor) + except FlavorError as fe: + warnings.warn("%s; returning an object of the ``%s`` flavor instead" + % (fe.args[0], src_flavor), FlavorWarning) + return array + + +def internal_to_flavor(array, dst_flavor): + """Get a version of the given `array` in a different `dst_flavor`. + + The input `array` must be of the internal flavor, and the returned + array will be of the given `dst_flavor`. See `flavor_to_flavor()` + for more information. + + """ + + return flavor_to_flavor(array, internal_flavor, dst_flavor) + + +def array_as_internal(array, src_flavor): + """Get a version of the given `array` in the internal flavor. + + The input `array` must be of the given `src_flavor`, and the + returned array will be of the internal flavor. + + If the conversion is not supported, a ``FlavorError`` is raised. + + """ + + return array_of_flavor2(array, src_flavor, internal_flavor) + + +def flavor_of(array): + """Identify the flavor of a given `array`. + + If the `array` can not be matched with any flavor, a ``TypeError`` + is raised. + + """ + + for flavor in all_flavors: + if identifier_map[flavor](array): + return flavor + type_name = type(array).__name__ + supported_descs = "; ".join(description_map[fl] for fl in all_flavors) + raise TypeError( + "objects of type ``%s`` are not supported in this context, sorry; " + "supported objects are: %s" % (type_name, supported_descs)) + + +def array_of_flavor(array, dst_flavor): + """Get a version of the given `array` in a different `dst_flavor`. + + The flavor of the input `array` is guessed, and the returned array + will be of the given `dst_flavor`. + + If the conversion is not supported, a ``FlavorError`` is raised. + + """ + + return array_of_flavor2(array, flavor_of(array), dst_flavor) + + +def restrict_flavors(keep=('python',)): + """Disable all flavors except those in keep. + + Providing an empty keep sequence implies disabling all flavors (but the + internal one). If the sequence is not specified, only optional flavors are + disabled. + + .. important:: Once you disable a flavor, it can not be enabled again. + + """ + + remove = set(all_flavors) - set(keep) - {internal_flavor} + for flavor in remove: + _disable_flavor(flavor) + + +# Flavor registration +# +# The order in which flavors appear in `all_flavors` determines the +# order in which they will be tested for by `flavor_of()`, so place +# most frequent flavors first. 
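+#
+# Only the 'numpy' and 'python' flavors are appended here; the helper maps
+# (aliases, descriptions, identifiers, converters) are filled in later by
+# `_register_all()` at the end of this module.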
+all_flavors.append('numpy') # this is the internal flavor + +all_flavors.append('python') # this is always supported + + +def _register_aliases(): + """Register aliases of *available* flavors.""" + + for flavor in all_flavors: + aliases = eval('_%s_aliases' % flavor) + for alias in aliases: + alias_map[alias] = flavor + + +def _register_descriptions(): + """Register descriptions of *available* flavors.""" + for flavor in all_flavors: + description_map[flavor] = eval('_%s_desc' % flavor) + + +def _register_identifiers(): + """Register identifier functions of *available* flavors.""" + + for flavor in all_flavors: + identifier_map[flavor] = eval('_is_%s' % flavor) + + +def _register_converters(): + """Register converter functions between *available* flavors.""" + + def identity(array): + return array + for src_flavor in all_flavors: + for dst_flavor in all_flavors: + # Converters with the same source and destination flavor + # are used when available, since they may perform some + # optimizations on the resulting array (e.g. making it + # contiguous). Otherwise, an identity function is used. + convfunc = None + try: + convfunc = eval(f'_conv_{src_flavor}_to_{dst_flavor}') + except NameError: + if src_flavor == dst_flavor: + convfunc = identity + if convfunc: + converter_map[(src_flavor, dst_flavor)] = convfunc + + +def _register_all(): + """Register all *available* flavors.""" + + _register_aliases() + _register_descriptions() + _register_identifiers() + _register_converters() + + +def _deregister_aliases(flavor): + """Deregister aliases of a given `flavor` (no checks).""" + + rm_aliases = [] + for (an_alias, a_flavor) in alias_map.items(): + if a_flavor == flavor: + rm_aliases.append(an_alias) + for an_alias in rm_aliases: + del alias_map[an_alias] + + +def _deregister_description(flavor): + """Deregister description of a given `flavor` (no checks).""" + + del description_map[flavor] + + +def _deregister_identifier(flavor): + """Deregister identifier function of a given `flavor` (no checks).""" + + del identifier_map[flavor] + + +def _deregister_converters(flavor): + """Deregister converter functions of a given `flavor` (no checks).""" + + rm_flavor_pairs = [] + for flavor_pair in converter_map: + if flavor in flavor_pair: + rm_flavor_pairs.append(flavor_pair) + for flavor_pair in rm_flavor_pairs: + del converter_map[flavor_pair] + + +def _disable_flavor(flavor): + """Completely disable the given `flavor` (no checks).""" + + _deregister_aliases(flavor) + _deregister_description(flavor) + _deregister_identifier(flavor) + _deregister_converters(flavor) + all_flavors.remove(flavor) + + +# Implementation of flavors +_python_aliases = [ + 'List', 'Tuple', + 'Int', 'Float', 'String', + 'VLString', 'Object', +] +_python_desc = ("homogeneous list or tuple, " + "integer, float, complex or bytes") + + +def _is_python(array): + return isinstance(array, (tuple, list, int, float, complex, bytes)) + + +_numpy_aliases = [] +_numpy_desc = "NumPy array, record or scalar" + + +if np.lib.NumpyVersion(np.__version__) >= np.lib.NumpyVersion('1.19.0'): + def toarray(array, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter('error') + try: + array = np.array(array, *args, **kwargs) + except np.VisibleDeprecationWarning: + raise ValueError( + 'cannot guess the desired dtype from the input') + + return array +else: + toarray = np.array + + +def _is_numpy(array): + return isinstance(array, (np.ndarray, np.generic)) + + +def _numpy_contiguous(convfunc): + """Decorate `convfunc` to return a 
*contiguous* NumPy array. + + Note: When arrays are 0-strided, the copy is avoided. This allows + to use `array` to still carry info about the dtype and shape. + """ + + def conv_to_numpy(array): + nparr = convfunc(array) + if (hasattr(nparr, 'flags') and + not nparr.flags.contiguous and + sum(nparr.strides) != 0): + nparr = nparr.copy() # copying the array makes it contiguous + return nparr + conv_to_numpy.__name__ = convfunc.__name__ + conv_to_numpy.__doc__ = convfunc.__doc__ + return conv_to_numpy + + +@_numpy_contiguous +def _conv_numpy_to_numpy(array): + # Passes contiguous arrays through and converts scalars into + # scalar arrays. + nparr = np.asarray(array) + if nparr.dtype.kind == 'U': + # from Python 3 loads of common strings are disguised as Unicode + try: + # try to convert to basic 'S' type + return nparr.astype('S') + except UnicodeEncodeError: + pass + # pass on true Unicode arrays downstream in case it can be + # handled in the future + return nparr + + +@_numpy_contiguous +def _conv_python_to_numpy(array): + nparr = toarray(array) + if nparr.dtype.kind == 'U': + # from Python 3 loads of common strings are disguised as Unicode + try: + # try to convert to basic 'S' type + return nparr.astype('S') + except UnicodeEncodeError: + pass + # pass on true Unicode arrays downstream in case it can be + # handled in the future + return nparr + + +def _conv_numpy_to_python(array): + if array.shape != (): + # Lists are the default for returning multidimensional objects + array = array.tolist() + else: + # 0-dim or scalar case + array = array.item() + return array + + +# Now register everything related with *available* flavors. +_register_all() + + +def _test(): + """Run ``doctest`` on this module.""" + + import doctest + doctest.testmod() + + +if __name__ == '__main__': + _test() diff --git a/tables/group.py b/tables/group.py new file mode 100644 index 0000000..2bf2a18 --- /dev/null +++ b/tables/group.py @@ -0,0 +1,1236 @@ +"""Here is defined the Group class.""" + +import os +import weakref +import warnings + +from .misc.proxydict import ProxyDict +from . import hdf5extension +from . import utilsextension +from .registry import class_id_dict +from .exceptions import (NodeError, NoSuchNodeError, NaturalNameWarning, + PerformanceWarning) +from .filters import Filters +from .registry import get_class_by_name +from .path import check_name_validity, join_path, isvisiblename +from .node import Node, NotLoggedMixin +from .leaf import Leaf +from .unimplemented import UnImplemented, Unknown + +from .link import Link, SoftLink, ExternalLink + + +obversion = "1.0" + + +class _ChildrenDict(ProxyDict): + def _get_value_from_container(self, container, key): + return container._f_get_child(key) + + +class Group(hdf5extension.Group, Node): + """Basic PyTables grouping structure. + + Instances of this class are grouping structures containing *child* + instances of zero or more groups or leaves, together with + supporting metadata. Each group has exactly one *parent* group. + + Working with groups and leaves is similar in many ways to working + with directories and files, respectively, in a Unix filesystem. + As with Unix directories and files, objects in the object tree are + often described by giving their full (or absolute) path names. + This full path can be specified either as a string (like in + '/group1/group2') or as a complete object path written in *natural + naming* schema (like in file.root.group1.group2). 
+ + A collateral effect of the *natural naming* schema is that the + names of members in the Group class and its instances must be + carefully chosen to avoid colliding with existing children node + names. For this reason and to avoid polluting the children + namespace all members in a Group start with some reserved prefix, + like _f_ (for public methods), _g_ (for private ones), _v_ (for + instance variables) or _c_ (for class variables). Any attempt to + create a new child node whose name starts with one of these + prefixes will raise a ValueError exception. + + Another effect of natural naming is that children named after + Python keywords or having names not valid as Python identifiers + (e.g. class, $a or 44) can not be accessed using the node.child + syntax. You will be forced to use node._f_get_child(child) to + access them (which is recommended for programmatic accesses). + + You will also need to use _f_get_child() to access an existing + child node if you set a Python attribute in the Group with the + same name as that node (you will get a NaturalNameWarning when + doing this). + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + name : str + The name of this node in its parent group. + title + The title for this group + new + If this group is new or has to be read from disk + filters : Filters + A Filters instance + + + .. versionchanged:: 3.0 + *parentNode* renamed into *parentnode* + + Notes + ----- + The following documentation includes methods that are automatically + called when a Group instance is accessed in a special way. + + For instance, this class defines the __setattr__, __getattr__, + __delattr__ and __dir__ methods, and they set, get and delete + *ordinary Python attributes* as normally intended. In addition to that, + __getattr__ allows getting *child nodes* by their name for the sake of + easy interaction on the command line, as long as there is no Python + attribute with the same name. Groups also allow the interactive + completion (when using readline) of the names of child nodes. + For instance:: + + # get a Python attribute + nchild = group._v_nchildren + + # Add a Table child called 'table' under 'group'. + h5file.create_table(group, 'table', myDescription) + table = group.table # get the table child instance + group.table = 'foo' # set a Python attribute + + # (PyTables warns you here about using the name of a child node.) + foo = group.table # get a Python attribute + del group.table # delete a Python attribute + table = group.table # get the table child instance again + + Additionally, on interactive python sessions you may get autocompletions + of children named as *valid python identifiers* by pressing the `[Tab]` + key, or to use the dir() global function. + + .. rubric:: Group attributes + + The following instance variables are provided in addition to those + in Node (see :ref:`NodeClassDescr`): + + .. attribute:: _v_children + + Dictionary with all nodes hanging from this group. + + .. attribute:: _v_groups + + Dictionary with all groups hanging from this group. + + .. attribute:: _v_hidden + + Dictionary with all hidden nodes hanging from this group. + + .. attribute:: _v_leaves + + Dictionary with all leaves hanging from this group. + + .. attribute:: _v_links + + Dictionary with all links hanging from this group. + + .. attribute:: _v_unknown + + Dictionary with all unknown nodes hanging from this group. + + """ + + # Class identifier. 
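+    # (written to the HDF5 ``CLASS`` attribute of new groups and looked up
+    # again through ``class_id_dict`` when children are loaded from disk)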
+ _c_classid = 'GROUP' + + # Children containers that should be loaded only in a lazy way. + # These are documented in the ``Group._g_add_children_names`` method. + _c_lazy_children_attrs = ( + '__members__', '_v_children', '_v_groups', '_v_leaves', + '_v_links', '_v_unknown', '_v_hidden') + + # `_v_nchildren` is a direct read-only shorthand + # for the number of *visible* children in a group. + def _g_getnchildren(self): + """The number of children hanging from this group.""" + return len(self._v_children) + + _v_nchildren = property(_g_getnchildren) + + # `_v_filters` is a direct read-write shorthand for the ``FILTERS`` + # attribute with the default `Filters` instance as a default value. + def _g_getfilters(self): + filters = getattr(self._v_attrs, 'FILTERS', None) + if filters is None: + filters = Filters() + return filters + + def _g_setfilters(self, value): + if not isinstance(value, Filters): + raise TypeError( + f"value is not an instance of `Filters`: {value!r}") + self._v_attrs.FILTERS = value + + def _g_delfilters(self): + del self._v_attrs.FILTERS + + _v_filters = property( + _g_getfilters, _g_setfilters, _g_delfilters, + """Default filter properties for child nodes. + + You can (and are encouraged to) use this property to get, set and + delete the FILTERS HDF5 attribute of the group, which stores a Filters + instance (see :ref:`FiltersClassDescr`). When the group has no such + attribute, a default Filters instance is used. + """) + + def __init__(self, parentnode, name, + title="", new=False, filters=None, + _log=True): + + # Remember to assign these values in the root group constructor + # if it does not use this one! + + # First, set attributes belonging to group objects. + + self._v_version = obversion + """The object version of this group.""" + + self._v_new = new + """Is this the first time the node has been created?""" + + self._v_new_title = title + """New title for this node.""" + + self._v_new_filters = filters + """New default filter properties for child nodes.""" + + self._v_max_group_width = parentnode._v_file.params['MAX_GROUP_WIDTH'] + """Maximum number of children on each group before warning the user. + + .. versionchanged:: 3.0 + The *_v_maxGroupWidth* attribute has been renamed into + *_v_max_group_width*. + + """ + + # Finally, set up this object as a node. + super().__init__(parentnode, name, _log) + + def _g_post_init_hook(self): + if self._v_new: + if self._v_file.params['PYTABLES_SYS_ATTRS']: + # Save some attributes for the new group on disk. + set_attr = self._v_attrs._g__setattr + # Set the title, class and version attributes. + set_attr('TITLE', self._v_new_title) + set_attr('CLASS', self._c_classid) + set_attr('VERSION', self._v_version) + + # Set the default filter properties. + newfilters = self._v_new_filters + if newfilters is None: + # If no filters have been passed in the constructor, + # inherit them from the parent group, but only if they + # have been inherited or explicitly set. + newfilters = getattr( + self._v_parent._v_attrs, 'FILTERS', None) + if newfilters is not None: + set_attr('FILTERS', newfilters) + else: + # If the file has PyTables format, get the VERSION attr + if 'VERSION' in self._v_attrs._v_attrnamessys: + self._v_version = self._v_attrs.VERSION + else: + self._v_version = "0.0 (unknown)" + # We don't need to get more attributes from disk, + # since the most important ones are defined as properties. 
+ + def __del__(self): + if (self._v_isopen and + self._v_pathname in self._v_file._node_manager.registry and + '_v_children' in self.__dict__): + # The group is going to be killed. Rebuild weak references + # (that Python cancelled just before calling this method) so + # that they are still usable if the object is revived later. + selfref = weakref.ref(self) + self._v_children.containerref = selfref + self._v_groups.containerref = selfref + self._v_leaves.containerref = selfref + self._v_links.containerref = selfref + self._v_unknown.containerref = selfref + self._v_hidden.containerref = selfref + + super().__del__() + + def _g_get_child_group_class(self, childname): + """Get the class of a not-yet-loaded group child. + + `childname` must be the name of a *group* child. + + """ + + childCID = self._g_get_gchild_attr(childname, 'CLASS') + if childCID is not None and not isinstance(childCID, str): + childCID = childCID.decode('utf-8') + + if childCID in class_id_dict: + return class_id_dict[childCID] # look up group class + else: + return Group # default group class + + def _g_get_child_leaf_class(self, childname, warn=True): + """Get the class of a not-yet-loaded leaf child. + + `childname` must be the name of a *leaf* child. If the child + belongs to an unknown kind of leaf, or if its kind can not be + guessed, `UnImplemented` will be returned and a warning will be + issued if `warn` is true. + + """ + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + childCID = self._g_get_lchild_attr(childname, 'CLASS') + if childCID is not None and not isinstance(childCID, str): + childCID = childCID.decode('utf-8') + else: + childCID = None + + if childCID in class_id_dict: + return class_id_dict[childCID] # look up leaf class + else: + # Unknown or no ``CLASS`` attribute, try a guess. + childCID2 = utilsextension.which_class(self._v_objectid, childname) + if childCID2 == 'UNSUPPORTED': + if warn: + if childCID is None: + warnings.warn( + "leaf ``%s`` is of an unsupported type; " + "it will become an ``UnImplemented`` node" + % self._g_join(childname)) + else: + warnings.warn( + ("leaf ``%s`` has an unknown class ID ``%s``; " + "it will become an ``UnImplemented`` node") + % (self._g_join(childname), childCID)) + return UnImplemented + assert childCID2 in class_id_dict + return class_id_dict[childCID2] # look up leaf class + + def _g_add_children_names(self): + """Add children names to this group taking into account their + visibility and kind.""" + + mydict = self.__dict__ + + # The names of the lazy attributes + mydict['__members__'] = members = [] + """The names of visible children nodes for readline-style completion. + """ + mydict['_v_children'] = children = _ChildrenDict(self) + """The number of children hanging from this group.""" + mydict['_v_groups'] = groups = _ChildrenDict(self) + """Dictionary with all groups hanging from this group.""" + mydict['_v_leaves'] = leaves = _ChildrenDict(self) + """Dictionary with all leaves hanging from this group.""" + mydict['_v_links'] = links = _ChildrenDict(self) + """Dictionary with all links hanging from this group.""" + mydict['_v_unknown'] = unknown = _ChildrenDict(self) + """Dictionary with all unknown nodes hanging from this group.""" + mydict['_v_hidden'] = hidden = _ChildrenDict(self) + """Dictionary with all hidden nodes hanging from this group.""" + + # Get the names of *all* child groups and leaves. 
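+        # (hidden children are included too; they are separated from the
+        # visible ones in the loop below based on `isvisiblename()`)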
+ (group_names, leaf_names, link_names, unknown_names) = \ + self._g_list_group(self._v_parent) + + # Separate groups into visible groups and hidden nodes, + # and leaves into visible leaves and hidden nodes. + for (childnames, childdict) in ((group_names, groups), + (leaf_names, leaves), + (link_names, links), + (unknown_names, unknown)): + + for childname in childnames: + # See whether the name implies that the node is hidden. + # (Assigned values are entirely irrelevant.) + if isvisiblename(childname): + # Visible node. + members.insert(0, childname) + children[childname] = None + childdict[childname] = None + else: + # Hidden node. + hidden[childname] = None + + def _g_check_has_child(self, name): + """Check whether 'name' is a children of 'self' and return its type.""" + + # Get the HDF5 name matching the PyTables name. + node_type = self._g_get_objinfo(name) + if node_type == "NoSuchNode": + raise NoSuchNodeError( + "group ``%s`` does not have a child named ``%s``" + % (self._v_pathname, name)) + return node_type + + def __iter__(self): + """Iterate over the child nodes hanging directly from the group. + + This iterator is *not* recursive. + + Examples + -------- + + :: + + # Non-recursively list all the nodes hanging from '/detector' + print("Nodes in '/detector' group:") + for node in h5file.root.detector: + print(node) + + """ + + return self._f_iter_nodes() + + def __contains__(self, name): + """Is there a child with that `name`? + + Returns a true value if the group has a child node (visible or + hidden) with the given `name` (a string), false otherwise. + + """ + + self._g_check_open() + try: + self._g_check_has_child(name) + except NoSuchNodeError: + return False + return True + + def __getitem__(self, childname): + """Return the (visible or hidden) child with that `name` ( a string). + + Raise IndexError if child not exist. + """ + try: + return self._f_get_child(childname) + except NoSuchNodeError: + raise IndexError(childname) + + def _f_walknodes(self, classname=None): + """Iterate over descendant nodes. + + This method recursively walks *self* top to bottom (preorder), + iterating over child groups in alphanumerical order, and yielding + nodes. If classname is supplied, only instances of the named class are + yielded. + + If *classname* is Group, it behaves like :meth:`Group._f_walk_groups`, + yielding only groups. If you don't want a recursive behavior, + use :meth:`Group._f_iter_nodes` instead. + + Examples + -------- + + :: + + # Recursively print all the arrays hanging from '/' + print("Arrays in the object tree '/':") + for array in h5file.root._f_walknodes('Array', recursive=True): + print(array) + + """ + + self._g_check_open() + + # For compatibility with old default arguments. 
+ if classname == '': + classname = None + + if classname == "Group": + # Recursive algorithm + yield from self._f_walk_groups() + else: + for group in self._f_walk_groups(): + yield from group._f_iter_nodes(classname) + + def _g_join(self, name): + """Helper method to correctly concatenate a name child object with the + pathname of this group.""" + + if name == "/": + # This case can happen when doing copies + return self._v_pathname + return join_path(self._v_pathname, name) + + def _g_width_warning(self): + """Issue a :exc:`PerformanceWarning` on too many children.""" + + warnings.warn("""\ +group ``%s`` is exceeding the recommended maximum number of children (%d); \ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O.""" + % (self._v_pathname, self._v_max_group_width), + PerformanceWarning) + + def _g_refnode(self, childnode, childname, validate=True): + """Insert references to a `childnode` via a `childname`. + + Checks that the `childname` is valid and does not exist, then + creates references to the given `childnode` by that `childname`. + The validation of the name can be omitted by setting `validate` + to a false value (this may be useful for adding already existing + nodes to the tree). + + """ + + # Check for name validity. + if validate: + check_name_validity(childname) + childnode._g_check_name(childname) + + # Check if there is already a child with the same name. + # This can be triggered because of the user + # (via node construction or renaming/movement). + # Links are not checked here because they are copied and referenced + # using ``File.get_node`` so they already exist in `self`. + if (not isinstance(childnode, Link)) and childname in self: + raise NodeError( + "group ``%s`` already has a child node named ``%s``" + % (self._v_pathname, childname)) + + # Show a warning if there is an object attribute with that name. + if childname in self.__dict__: + warnings.warn( + "group ``%s`` already has an attribute named ``%s``; " + "you will not be able to use natural naming " + "to access the child node" + % (self._v_pathname, childname), NaturalNameWarning) + + # Check group width limits. + if (len(self._v_children) + len(self._v_hidden) >= + self._v_max_group_width): + self._g_width_warning() + + # Update members information. + # Insert references to the new child. + # (Assigned values are entirely irrelevant.) + if isvisiblename(childname): + # Visible node. + self.__members__.insert(0, childname) # enable completion + self._v_children[childname] = None # insert node + if isinstance(childnode, Unknown): + self._v_unknown[childname] = None + elif isinstance(childnode, Link): + self._v_links[childname] = None + elif isinstance(childnode, Leaf): + self._v_leaves[childname] = None + elif isinstance(childnode, Group): + self._v_groups[childname] = None + else: + # Hidden node. + self._v_hidden[childname] = None # insert node + + def _g_unrefnode(self, childname): + """Remove references to a node. + + Removes all references to the named node. + + """ + + # This can *not* be triggered because of the user. + assert childname in self, \ + ("group ``%s`` does not have a child node named ``%s``" + % (self._v_pathname, childname)) + + # Update members information, if needed + if '_v_children' in self.__dict__: + if childname in self._v_children: + # Visible node. 
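+                # (Drop the name from ``__members__`` so that interactive
+                # completion stops offering it, then remove it from the
+                # general children mapping and from whichever kind-specific
+                # mapping holds it.)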
+ members = self.__members__ + member_index = members.index(childname) + del members[member_index] # disables completion + + del self._v_children[childname] # remove node + self._v_unknown.pop(childname, None) + self._v_links.pop(childname, None) + self._v_leaves.pop(childname, None) + self._v_groups.pop(childname, None) + else: + # Hidden node. + del self._v_hidden[childname] # remove node + + def _g_move(self, newparent, newname): + # Move the node to the new location. + oldpath = self._v_pathname + super()._g_move(newparent, newname) + newpath = self._v_pathname + + # Update location information in children. This node shouldn't + # be affected since it has already been relocated. + self._v_file._update_node_locations(oldpath, newpath) + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + # Compute default arguments. + title = kwargs.get('title', self._v_title) + filters = kwargs.get('filters', None) + stats = kwargs.get('stats', None) + + # Fix arguments with explicit None values for backwards compatibility. + if title is None: + title = self._v_title + # If no filters have been passed to the call, copy them from the + # source group, but only if inherited or explicitly set. + if filters is None: + filters = getattr(self._v_attrs, 'FILTERS', None) + + # Create a copy of the object. + new_node = Group(newparent, newname, + title, new=True, filters=filters, _log=_log) + + # Copy user attributes if needed. + if kwargs.get('copyuserattrs', True): + self._v_attrs._g_copy(new_node._v_attrs, copyclass=True) + + # Update statistics if needed. + if stats is not None: + stats['groups'] += 1 + + if recursive: + # Copy child nodes if a recursive copy was requested. + # Some arguments should *not* be passed to children copy ops. + kwargs = kwargs.copy() + kwargs.pop('title', None) + self._g_copy_children(new_node, **kwargs) + + return new_node + + def _g_copy_children(self, newparent, **kwargs): + """Copy child nodes. + + Copies all nodes descending from this one into the specified + `newparent`. If the new parent has a child node with the same + name as one of the nodes in this group, the copy fails with a + `NodeError`, maybe resulting in a partial copy. Nothing is + logged. + + """ + + # Recursive version of children copy. + # for srcchild in self._v_children.itervalues(): + # srcchild._g_copy_as_child(newparent, **kwargs) + + # Non-recursive version of children copy. + use_hardlinks = kwargs.get('use_hardlinks', False) + if use_hardlinks: + address_map = kwargs.setdefault('address_map', {}) + + parentstack = [(self, newparent)] # [(source, destination), ...] + while parentstack: + (srcparent, dstparent) = parentstack.pop() + + if use_hardlinks: + for srcchild in srcparent._v_children.values(): + addr, rc = srcchild._get_obj_info() + if rc > 1 and addr in address_map: + where, name = address_map[addr][0] + localsrc = os.path.join(where, name) + dstparent._v_file.create_hard_link(dstparent, + srcchild.name, + localsrc) + address_map[addr].append( + (dstparent._v_pathname, srcchild.name) + ) + + # Update statistics if needed. 
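+                        # (A hard link reuses the target's data, so only
+                        # the 'hardlinks' counter is bumped here; 'groups',
+                        # 'leaves' and 'bytes' are left untouched.)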
+ stats = kwargs.pop('stats', None) + if stats is not None: + stats['hardlinks'] += 1 + else: + dstchild = srcchild._g_copy_as_child(dstparent, + **kwargs) + if isinstance(srcchild, Group): + parentstack.append((srcchild, dstchild)) + + if rc > 1: + address_map[addr] = [ + (dstparent._v_pathname, srcchild.name) + ] + else: + for srcchild in srcparent._v_children.values(): + dstchild = srcchild._g_copy_as_child(dstparent, **kwargs) + if isinstance(srcchild, Group): + parentstack.append((srcchild, dstchild)) + + def _f_get_child(self, childname): + """Get the child called childname of this group. + + If the child exists (be it visible or not), it is returned. Else, a + NoSuchNodeError is raised. + + Using this method is recommended over getattr() when doing programmatic + accesses to children if childname is unknown beforehand or when its + name is not a valid Python identifier. + + """ + + self._g_check_open() + + self._g_check_has_child(childname) + + childpath = join_path(self._v_pathname, childname) + return self._v_file._get_node(childpath) + + def _f_list_nodes(self, classname=None): + """Return a *list* with children nodes. + + This is a list-returning version of :meth:`Group._f_iter_nodes()`. + + """ + + return list(self._f_iter_nodes(classname)) + + def _f_iter_nodes(self, classname=None): + """Iterate over children nodes. + + Child nodes are yielded alphanumerically sorted by node name. If the + name of a class derived from Node (see :ref:`NodeClassDescr`) is + supplied in the classname parameter, only instances of that class (or + subclasses of it) will be returned. + + This is an iterator version of :meth:`Group._f_list_nodes`. + + """ + + self._g_check_open() + + if not classname: + # Returns all the children alphanumerically sorted + for name in sorted(self._v_children): + yield self._v_children[name] + elif classname == 'Group': + # Returns all the groups alphanumerically sorted + for name in sorted(self._v_groups): + yield self._v_groups[name] + elif classname == 'Leaf': + # Returns all the leaves alphanumerically sorted + for name in sorted(self._v_leaves): + yield self._v_leaves[name] + elif classname == 'Link': + # Returns all the links alphanumerically sorted + for name in sorted(self._v_links): + yield self._v_links[name] + elif classname == 'IndexArray': + raise TypeError( + "listing ``IndexArray`` nodes is not allowed") + else: + class_ = get_class_by_name(classname) + for childname, childnode in sorted(self._v_children.items()): + if isinstance(childnode, class_): + yield childnode + + def _f_walk_groups(self): + """Recursively iterate over descendent groups (not leaves). + + This method starts by yielding *self*, and then it goes on to + recursively iterate over all child groups in alphanumerical order, top + to bottom (preorder), following the same procedure. + + """ + + self._g_check_open() + + stack = [self] + yield self + # Iterate over the descendants + while stack: + objgroup = stack.pop() + groupnames = sorted(objgroup._v_groups) + # Sort the groups before delivering. This uses the groups names + # for groups in tree (in order to sort() can classify them). + for groupname in groupnames: + # TODO: check recursion + stack.append(objgroup._v_groups[groupname]) + yield objgroup._v_groups[groupname] + + def __delattr__(self, name): + """Delete a Python attribute called name. + + This method only provides a extra warning in case the user + tries to delete a children node using __delattr__. 
+ + To remove a children node from this group use + :meth:`File.remove_node` or :meth:`Node._f_remove`. To delete + a PyTables node attribute use :meth:`File.del_node_attr`, + :meth:`Node._f_delattr` or :attr:`Node._v_attrs``. + + If there is an attribute and a child node with the same name, + the child node will be made accessible again via natural naming. + + """ + + try: + super().__delattr__(name) # nothing particular + except AttributeError as ae: + hint = " (use ``node._f_remove()`` if you want to remove a node)" + raise ae.__class__(str(ae) + hint) + + def __dir__(self): + """Autocomplete only children named as valid python identifiers. + + Only PY3 supports this special method. + """ + subnods = [c for c in self._v_children if c.isidentifier()] + return super().__dir__() + subnods + + def __getattr__(self, name): + """Get a Python attribute or child node called name. + If the node has a child node called name it is returned, + else an AttributeError is raised. + """ + + if name in self._c_lazy_children_attrs: + self._g_add_children_names() + return self.__dict__[name] + return self._f_get_child(name) + + def __setattr__(self, name, value): + """Set a Python attribute called name with the given value. + + This method stores an *ordinary Python attribute* in the object. It + does *not* store new children nodes under this group; for that, use the + File.create*() methods (see the File class + in :ref:`FileClassDescr`). It does *neither* store a PyTables node + attribute; for that, + use :meth:`File.set_node_attr`, :meth`:Node._f_setattr` + or :attr:`Node._v_attrs`. + + If there is already a child node with the same name, a + NaturalNameWarning will be issued and the child node will not be + accessible via natural naming nor getattr(). It will still be available + via :meth:`File.get_node`, :meth:`Group._f_get_child` and children + dictionaries in the group (if visible). + + """ + + # Show a warning if there is an child node with that name. + # + # ..note:: + # + # Using ``if name in self:`` is not right since that would + # require ``_v_children`` and ``_v_hidden`` to be already set + # when the very first attribute assignments are made. + # Moreover, this warning is only concerned about clashes with + # names used in natural naming, i.e. those in ``__members__``. + # + # ..note:: + # + # The check ``'__members__' in myDict`` allows attribute + # assignment to happen before calling `Group.__init__()`, by + # avoiding to look into the still not assigned ``__members__`` + # attribute. This allows subclasses to set up some attributes + # and then call the constructor of the superclass. If the + # check above is disabled, that results in Python entering an + # endless loop on exit! + + mydict = self.__dict__ + if '__members__' in mydict and name in self.__members__: + warnings.warn( + "group ``%s`` already has a child node named ``%s``; " + "you will not be able to use natural naming " + "to access the child node" + % (self._v_pathname, name), NaturalNameWarning) + + super().__setattr__(name, value) + + def _f_flush(self): + """Flush this Group.""" + + self._g_check_open() + self._g_flush_group() + + def _g_close_descendents(self): + """Close all the *loaded* descendent nodes of this group.""" + + node_manager = self._v_file._node_manager + node_manager.close_subtree(self._v_pathname) + + def _g_close(self): + """Close this (open) group.""" + + if self._v_isopen: + # hdf5extension operations: + # Close HDF5 group. + self._g_close_group() + + # Close myself as a node. 
+ super()._f_close() + + def _f_close(self): + """Close this group and all its descendents. + + This method has the behavior described in :meth:`Node._f_close`. + It should be noted that this operation closes all the nodes + descending from this group. + + You should not need to close nodes manually because they are + automatically opened/closed when they are loaded/evicted from + the integrated LRU cache. + + """ + + # If the group is already closed, return immediately + if not self._v_isopen: + return + + # First, close all the descendents of this group, unless a) the + # group is being deleted (evicted from LRU cache) or b) the node + # is being closed during an aborted creation, in which cases + # this is not an explicit close issued by the user. + if not (self._v__deleting or self._v_objectid is None): + self._g_close_descendents() + + # When all the descendents have been closed, close this group. + # This is done at the end because some nodes may still need to + # be loaded during the closing process; thus this node must be + # open until the very end. + self._g_close() + + def _g_remove(self, recursive=False, force=False): + """Remove (recursively if needed) the Group. + + This version correctly handles both visible and hidden nodes. + + """ + + if self._v_nchildren > 0: + if not (recursive or force): + raise NodeError("group ``%s`` has child nodes; " + "please set `recursive` or `force` to true " + "to remove it" + % (self._v_pathname,)) + + # First close all the descendents hanging from this group, + # so that it is not possible to use a node that no longer exists. + self._g_close_descendents() + + # Remove the node itself from the hierarchy. + super()._g_remove(recursive, force) + + def _f_copy(self, newparent=None, newname=None, + overwrite=False, recursive=False, createparents=False, + **kwargs): + """Copy this node and return the new one. + + This method has the behavior described in :meth:`Node._f_copy`. + In addition, it recognizes the following keyword arguments: + + Parameters + ---------- + title + The new title for the destination. If omitted or None, the + original title is used. This only applies to the topmost + node in recursive copies. + filters : Filters + Specifying this parameter overrides the original filter + properties in the source node. If specified, it must be an + instance of the Filters class (see :ref:`FiltersClassDescr`). + The default is to copy the filter properties from the source + node. + copyuserattrs + You can prevent the user attributes from being copied by setting + thisparameter to False. The default is to copy them. + stats + This argument may be used to collect statistics on the copy + process. When used, it should be a dictionary with keys 'groups', + 'leaves', 'links' and 'bytes' having a numeric value. Their values + willbe incremented to reflect the number of groups, leaves and + bytes, respectively, that have been copied during the operation. + + """ + + return super()._f_copy( + newparent, newname, + overwrite, recursive, createparents, **kwargs) + + def _f_copy_children(self, dstgroup, overwrite=False, recursive=False, + createparents=False, **kwargs): + """Copy the children of this group into another group. + + Children hanging directly from this group are copied into dstgroup, + which can be a Group (see :ref:`GroupClassDescr`) object or its + pathname in string form. If createparents is true, the needed groups + for the given destination group path to exist will be created. 
+ + The operation will fail with a NodeError if there is a child node + in the destination group with the same name as one of the copied + children from this one, unless overwrite is true; in this case, + the former child node is recursively removed before copying the + later. + + By default, nodes descending from children groups of this node + are not copied. If the recursive argument is true, all descendant + nodes of this node are recursively copied. + + Additional keyword arguments may be passed to customize the + copying process. For instance, title and filters may be changed, + user attributes may be or may not be copied, data may be sub-sampled, + stats may be collected, etc. Arguments unknown to nodes are simply + ignored. Check the documentation for copying operations of nodes to + see which options they support. + + """ + + self._g_check_open() + + # `dstgroup` is used instead of its path to avoid accepting + # `Node` objects when `createparents` is true. Also, note that + # there is no risk of creating parent nodes and failing later + # because of destination nodes already existing. + dstparent = self._v_file._get_or_create_path(dstgroup, createparents) + self._g_check_group(dstparent) # Is it a group? + + if not overwrite: + # Abort as early as possible when destination nodes exist + # and overwriting is not enabled. + for childname in self._v_children: + if childname in dstparent: + raise NodeError( + "destination group ``%s`` already has " + "a node named ``%s``; " + "you may want to use the ``overwrite`` argument" + % (dstparent._v_pathname, childname)) + + use_hardlinks = kwargs.get('use_hardlinks', False) + if use_hardlinks: + address_map = kwargs.setdefault('address_map', {}) + + for child in self._v_children.values(): + addr, rc = child._get_obj_info() + if rc > 1 and addr in address_map: + where, name = address_map[addr][0] + localsrc = os.path.join(where, name) + dstparent._v_file.create_hard_link(dstparent, child.name, + localsrc) + address_map[addr].append( + (dstparent._v_pathname, child.name) + ) + + # Update statistics if needed. + stats = kwargs.pop('stats', None) + if stats is not None: + stats['hardlinks'] += 1 + else: + child._f_copy(dstparent, None, overwrite, recursive, + **kwargs) + if rc > 1: + address_map[addr] = [ + (dstparent._v_pathname, child.name) + ] + else: + for child in self._v_children.values(): + child._f_copy(dstparent, None, overwrite, recursive, **kwargs) + + def __str__(self): + """Return a short string representation of the group. + + Examples + -------- + + :: + + >>> import tables + >>> f = tables.open_file('tables/tests/Tables_lzo2.h5') + >>> print(f.root.group0) + /group0 (Group) '' + >>> f.close() + + """ + + return (f"{self._v_pathname} ({self.__class__.__name__}) " + f"{self._v_title!r}") + + def __repr__(self): + """Return a detailed string representation of the group. + + Examples + -------- + + :: + + >>> import tables + >>> f = tables.open_file('tables/tests/Tables_lzo2.h5') + >>> f.root.group0 + /group0 (Group) '' + children := ['group1' (Group), 'tuple1' (Table)] + >>> f.close() + + """ + + rep = [ + f'{childname!r} ({child.__class__.__name__})' + for (childname, child) in self._v_children.items() + ] + return f'{self!s}\n children := [{", ".join(rep)}]' + + +# Special definition for group root +class RootGroup(Group): + + def __init__(self, ptfile, name, title, new, filters): + mydict = self.__dict__ + + # Set group attributes. 
+ self._v_version = obversion + self._v_new = new + if new: + self._v_new_title = title + self._v_new_filters = filters + else: + self._v_new_title = None + self._v_new_filters = None + + # Set node attributes. + self._v_file = ptfile + self._v_isopen = True # root is always open + self._v_pathname = '/' + self._v_name = '/' + self._v_depth = 0 + self._v_max_group_width = ptfile.params['MAX_GROUP_WIDTH'] + self._v__deleting = False + self._v_objectid = None # later + + # Only the root node has the file as a parent. + # Bypass __setattr__ to avoid the ``Node._v_parent`` property. + mydict['_v_parent'] = ptfile + ptfile._node_manager.register_node(self, '/') + + # hdf5extension operations (do before setting an AttributeSet): + # Update node attributes. + self._g_new(ptfile, name, init=True) + # Open the node and get its object ID. + self._v_objectid = self._g_open() + + # Set disk attributes and read children names. + # + # This *must* be postponed because this method needs the root node + # to be created and bound to ``File.root``. + # This is an exception to the rule, handled by ``File.__init()__``. + # + # self._g_post_init_hook() + + def _g_load_child(self, childname): + """Load a child node from disk. + + The child node `childname` is loaded from disk and an adequate + `Node` object is created and returned. If there is no such + child, a `NoSuchNodeError` is raised. + + """ + + if self._v_file.root_uep != "/": + childname = join_path(self._v_file.root_uep, childname) + # Is the node a group or a leaf? + node_type = self._g_check_has_child(childname) + + # Nodes that HDF5 report as H5G_UNKNOWN + if node_type == 'Unknown': + return Unknown(self, childname) + + # Guess the PyTables class suited to the node, + # build a PyTables node and return it. + if node_type == "Group": + if self._v_file.params['PYTABLES_SYS_ATTRS']: + ChildClass = self._g_get_child_group_class(childname) + else: + # Default is a Group class + ChildClass = Group + return ChildClass(self, childname, new=False) + elif node_type == "Leaf": + ChildClass = self._g_get_child_leaf_class(childname, warn=True) + # Building a leaf may still fail because of unsupported types + # and other causes. + # return ChildClass(self, childname) # uncomment for debugging + try: + return ChildClass(self, childname) + except Exception as exc: # XXX + warnings.warn( + "problems loading leaf ``%s``::\n\n" + " %s\n\n" + "The leaf will become an ``UnImplemented`` node." 
+ % (self._g_join(childname), exc)) + # If not, associate an UnImplemented object to it + return UnImplemented(self, childname) + elif node_type == "SoftLink": + return SoftLink(self, childname) + elif node_type == "ExternalLink": + return ExternalLink(self, childname) + else: + return UnImplemented(self, childname) + + def _f_rename(self, newname): + raise NodeError("the root node can not be renamed") + + def _f_move(self, newparent=None, newname=None, createparents=False): + raise NodeError("the root node can not be moved") + + def _f_remove(self, recursive=False): + raise NodeError("the root node can not be removed") + + +class TransactionGroupG(NotLoggedMixin, Group): + _c_classid = 'TRANSGROUP' + + def _g_width_warning(self): + warnings.warn("""\ +the number of transactions is exceeding the recommended maximum (%d);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v_max_group_width,), PerformanceWarning) + + +class TransactionG(NotLoggedMixin, Group): + _c_classid = 'TRANSG' + + def _g_width_warning(self): + warnings.warn("""\ +transaction ``%s`` is exceeding the recommended maximum number of marks (%d);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v_pathname, self._v_max_group_width), + PerformanceWarning) + + +class MarkG(NotLoggedMixin, Group): + # Class identifier. + _c_classid = 'MARKG' + + import re + _c_shadow_name_re = re.compile(r'^a[0-9]+$') + + def _g_width_warning(self): + warnings.warn("""\ +mark ``%s`` is exceeding the recommended maximum action storage (%d nodes);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v_pathname, self._v_max_group_width), + PerformanceWarning) + + def _g_reset(self): + """Empty action storage (nodes and attributes). + + This method empties all action storage kept in this node: nodes + and attributes. + + """ + + # Remove action storage nodes. + for child in list(self._v_children.values()): + child._g_remove(True, True) + + # Remove action storage attributes. 
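+        # (Only attributes whose names match the shadow-name pattern
+        # ``a<number>`` are deleted; any other user attribute stored on
+        # the mark node is preserved.)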
+ attrs = self._v_attrs + shname = self._c_shadow_name_re + for attrname in attrs._v_attrnamesuser[:]: + if shname.match(attrname): + attrs._g__delattr(attrname) diff --git a/tables/hdf5extension.pxd b/tables/hdf5extension.pxd new file mode 100644 index 0000000..19c346d --- /dev/null +++ b/tables/hdf5extension.pxd @@ -0,0 +1,40 @@ +######################################################################## +# +# License: BSD +# Created: +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +from numpy cimport ndarray +from .definitions cimport hid_t, hsize_t, hbool_t + + +# Declaration of instance variables for shared classes +cdef class Node: + cdef object name + cdef hid_t parent_id + +cdef class Leaf(Node): + cdef hid_t dataset_id + cdef hid_t type_id + cdef hid_t base_type_id + cdef hid_t disk_type_id + cdef hsize_t *dims # Necessary to be here because of Leaf._g_truncate() + cdef _get_type_ids(self) + cdef _convert_time64(self, ndarray nparr, int sense) + +cdef class Array(Leaf): + cdef int rank + cdef hsize_t *maxdims + cdef hsize_t *dims_chunk + + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/hdf5extension.pyx b/tables/hdf5extension.pyx new file mode 100644 index 0000000..e86e745 --- /dev/null +++ b/tables/hdf5extension.pyx @@ -0,0 +1,2244 @@ +######################################################################## +# +# License: BSD +# Created: September 21, 2002 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +"""Cython interface between several PyTables classes and HDF5 library. + +Classes (type extensions): + + File + AttributeSet + Node + Leaf + Group + Array + VLArray + UnImplemented + +Functions: + +Misc variables: + +""" + +import os +import warnings +from collections import namedtuple + +ObjInfo = namedtuple('ObjInfo', ['addr', 'rc']) +ObjTimestamps = namedtuple('ObjTimestamps', ['atime', 'mtime', + 'ctime', 'btime']) + +import pickle + +import numpy + +from .exceptions import HDF5ExtError, DataTypeWarning + +from .utils import (check_file_access, byteorders, correct_byteorder, + SizeType) + +from .atom import Atom + +from .description import descr_from_dtype + +from .utilsextension import (encode_filename, set_blosc_max_threads, + atom_to_hdf5_type, atom_from_hdf5_type, hdf5_to_np_ext_type, create_nested_type, + pttype_to_hdf5, pt_special_kinds, npext_prefixes_to_ptkinds, hdf5_class_to_string, + platform_byteorder) + + +# Types, constants, functions, classes & other objects from everywhere +from libc.stdlib cimport malloc, free +from libc.string cimport strdup, strlen +from numpy cimport (import_array, ndarray, npy_intp, PyArray_BYTES, PyArray_DATA, + PyArray_DIMS, PyArray_NDIM, PyArray_STRIDE) +from cpython.bytes cimport (PyBytes_AsString, PyBytes_FromStringAndSize, + PyBytes_Check) +from cpython.unicode cimport PyUnicode_DecodeUTF8 + + +from .definitions cimport (uintptr_t, hid_t, herr_t, hsize_t, hvl_t, + H5S_seloper_t, H5D_FILL_VALUE_UNDEFINED, + H5O_TYPE_UNKNOWN, H5O_TYPE_GROUP, H5O_TYPE_DATASET, H5O_TYPE_NAMED_DATATYPE, + H5L_TYPE_ERROR, H5L_TYPE_HARD, H5L_TYPE_SOFT, H5L_TYPE_EXTERNAL, + H5T_class_t, H5T_sign_t, H5T_NATIVE_INT, + H5T_cset_t, H5T_CSET_ASCII, H5T_CSET_UTF8, + H5F_SCOPE_GLOBAL, H5F_ACC_TRUNC, H5F_ACC_RDONLY, H5F_ACC_RDWR, + H5P_DEFAULT, H5P_FILE_ACCESS, H5P_FILE_CREATE, H5T_DIR_DEFAULT, + H5S_SELECT_SET, 
H5S_SELECT_AND, H5S_SELECT_NOTB, + H5Fcreate, H5Fopen, H5Fclose, H5Fflush, H5Fget_vfd_handle, H5Fget_filesize, + H5Fget_create_plist, + H5Gcreate, H5Gopen, H5Gclose, H5Ldelete, H5Lmove, + H5Dopen, H5Dclose, H5Dread, H5Dwrite, H5Dget_type, H5Dget_create_plist, + H5Dget_space, H5Dvlen_reclaim, H5Dget_storage_size, H5Dvlen_get_buf_size, + H5Tget_native_type, H5Tclose, H5Tis_variable_str, H5Tget_sign, + H5Adelete, H5T_BITFIELD, H5T_INTEGER, H5T_FLOAT, H5T_STRING, H5Tget_order, + H5Pcreate, H5Pset_cache, H5Pclose, H5Pget_userblock, H5Pset_userblock, + H5Pset_fapl_sec2, H5Pset_fapl_log, H5Pset_fapl_stdio, H5Pset_fapl_core, + H5Pset_fapl_split, H5Pget_obj_track_times, + H5Sselect_all, H5Sselect_elements, H5Sselect_hyperslab, + H5Screate_simple, H5Sclose, + H5Oget_info, H5O_info_t, + H5ATTRset_attribute, H5ATTRset_attribute_string, + H5ATTRget_attribute, H5ATTRget_attribute_string, + H5ATTRget_attribute_vlen_string_array, + H5ATTRfind_attribute, H5ATTRget_type_ndims, H5ATTRget_dims, + H5ARRAYget_ndims, H5ARRAYget_info, + set_cache_size, get_objinfo, get_linkinfo, Giterate, Aiterate, H5UIget_info, + get_len_of_range, conv_float64_timeval32, truncate_dset, + H5_HAVE_DIRECT_DRIVER, pt_H5Pset_fapl_direct, + H5_HAVE_WINDOWS_DRIVER, pt_H5Pset_fapl_windows, + H5_HAVE_IMAGE_FILE, pt_H5Pset_file_image, pt_H5Fget_file_image, + H5Tget_size, hobj_ref_t) + +cdef int H5T_CSET_DEFAULT = 16 + +from .utilsextension cimport malloc_dims, get_native_type, cstr_to_pystr, load_reference + + +#------------------------------------------------------------------- + +cdef extern from "Python.h": + + object PyByteArray_FromStringAndSize(char *s, Py_ssize_t len) + +# Functions from HDF5 ARRAY (this is not part of HDF5 HL; it's private) +cdef extern from "H5ARRAY.h" nogil: + + herr_t H5ARRAYmake(hid_t loc_id, char *dset_name, char *obversion, + int rank, hsize_t *dims, int extdim, + hid_t type_id, hsize_t *dims_chunk, void *fill_data, + int complevel, char *complib, int shuffle, + int fletcher32, hbool_t track_times, void *data) + + herr_t H5ARRAYappend_records(hid_t dataset_id, hid_t type_id, + int rank, hsize_t *dims_orig, + hsize_t *dims_new, int extdim, void *data ) + + herr_t H5ARRAYwrite_records(hid_t dataset_id, hid_t type_id, + int rank, hsize_t *start, hsize_t *step, + hsize_t *count, void *data) + + herr_t H5ARRAYread(hid_t dataset_id, hid_t type_id, + hsize_t start, hsize_t nrows, hsize_t step, + int extdim, void *data) + + herr_t H5ARRAYreadSlice(hid_t dataset_id, hid_t type_id, + hsize_t *start, hsize_t *stop, + hsize_t *step, void *data) + + herr_t H5ARRAYreadIndex(hid_t dataset_id, hid_t type_id, int notequal, + hsize_t *start, hsize_t *stop, hsize_t *step, + void *data) + + herr_t H5ARRAYget_chunkshape(hid_t dataset_id, int rank, hsize_t *dims_chunk) + + herr_t H5ARRAYget_fill_value( hid_t dataset_id, hid_t type_id, + int *status, void *value) + + +# Functions for dealing with VLArray objects +cdef extern from "H5VLARRAY.h" nogil: + + herr_t H5VLARRAYmake( hid_t loc_id, char *dset_name, char *obversion, + int rank, hsize_t *dims, hid_t type_id, + hsize_t chunk_size, void *fill_data, int complevel, + char *complib, int shuffle, int fletcher32, + hbool_t track_times, void *data) + + herr_t H5VLARRAYappend_records( hid_t dataset_id, hid_t type_id, + int nobjects, hsize_t nrecords, + void *data ) + + herr_t H5VLARRAYmodify_records( hid_t dataset_id, hid_t type_id, + hsize_t nrow, int nobjects, + void *data ) + + herr_t H5VLARRAYget_info( hid_t dataset_id, hid_t type_id, + hsize_t *nrecords, char *base_byteorder) + 
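+
+# Note: the routines declared above come from the small "private" C helper
+# library bundled with PyTables (H5ARRAY/H5VLARRAY), not from HDF5 itself.
+# As a rough, illustrative sketch of the calling convention (names taken
+# from the declarations above, error handling elided):
+#
+#   cdef hsize_t *chunkdims = <hsize_t *>malloc(rank * sizeof(hsize_t))
+#   if H5ARRAYget_chunkshape(dataset_id, rank, chunkdims) < 0:
+#       ...  # no chunk shape could be obtained
+#   free(chunkdims)
+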
+ +#---------------------------------------------------------------------------- + +# Initialization code + +# The numpy API requires this function to be called before +# using any numpy facilities in an extension module. +import_array() + +#--------------------------------------------------------------------------- + +# Helper functions + +cdef hsize_t *npy_malloc_dims(int rank, npy_intp *pdims): + """Returns a malloced hsize_t dims from a npy_intp *pdims.""" + + cdef int i + cdef hsize_t *dims + + dims = NULL + if rank > 0: + dims = malloc(rank * sizeof(hsize_t)) + for i from 0 <= i < rank: + dims[i] = pdims[i] + return dims + + +cdef object getshape(int rank, hsize_t *dims): + """Return a shape (tuple) from a dims C array of rank dimensions.""" + + cdef int i + cdef object shape + + shape = [] + for i from 0 <= i < rank: + shape.append(SizeType(dims[i])) + + return tuple(shape) + + +# Helper function for quickly fetch an attribute string +cdef object get_attribute_string_or_none(hid_t node_id, char* attr_name): + """Returns a string/unicode attribute if it exists in node_id. + + It returns ``None`` in case it don't exists (or there have been problems + reading it). + + """ + + cdef char *attr_value + cdef int cset = H5T_CSET_DEFAULT + cdef object retvalue + cdef hsize_t size + + attr_value = NULL + retvalue = None # Default value + if H5ATTRfind_attribute(node_id, attr_name): + size = H5ATTRget_attribute_string(node_id, attr_name, &attr_value, &cset) + if size == 0: + if cset == H5T_CSET_UTF8: + retvalue = numpy.unicode_('') + else: + retvalue = numpy.bytes_(b'') + elif cset == H5T_CSET_UTF8: + if size == 1 and attr_value[0] == 0: + # compatibility with PyTables <= 3.1.1 + retvalue = numpy.unicode_('') + retvalue = PyUnicode_DecodeUTF8(attr_value, size, NULL) + retvalue = numpy.unicode_(retvalue) + else: + retvalue = PyBytes_FromStringAndSize(attr_value, size) + # AV: oct 2012 + # since now we use the string size got form HDF5 we have to strip + # trailing zeros used for padding. + # The entire process is quite odd but due to a bug (??) in the way + # numpy arrays are pickled in python 3 we can't assume that + # strlen(attr_value) is the actual length of the attribute + # and numpy.bytes_(attr_value) can give a truncated pickle string + retvalue = retvalue.rstrip(b'\x00') + retvalue = numpy.bytes_(retvalue) + + # Important to release attr_value, because it has been malloc'ed! + if attr_value: + free(attr_value) + + return retvalue + + +# Get the numpy dtype scalar attribute from an HDF5 type as fast as possible +cdef object get_dtype_scalar(hid_t type_id, H5T_class_t class_id, + size_t itemsize): + cdef H5T_sign_t sign + cdef object stype + + if class_id == H5T_BITFIELD: + stype = "b1" + elif class_id == H5T_INTEGER: + # Get the sign + sign = H5Tget_sign(type_id) + if (sign > 0): + stype = "i%s" % (itemsize) + else: + stype = "u%s" % (itemsize) + elif class_id == H5T_FLOAT: + stype = "f%s" % (itemsize) + elif class_id == H5T_STRING: + if H5Tis_variable_str(type_id): + raise TypeError("variable length strings are not supported yet") + stype = "S%s" % (itemsize) + + # Try to get a NumPy type. If this can't be done, return None. 
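+  # (For instance, an H5T_INTEGER with an unsupported itemsize would give
+  # a string such as "i16", which numpy.dtype() rejects with a TypeError,
+  # so ``None`` is handed back to the caller.)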
+ try: + ntype = numpy.dtype(stype) + except TypeError: + ntype = None + return ntype + + +_supported_drivers = ( + "H5FD_SEC2", + "H5FD_DIRECT", + #"H5FD_LOG", + "H5FD_WINDOWS", + "H5FD_STDIO", + "H5FD_CORE", + #"H5FD_FAMILY", + #"H5FD_MULTI", + "H5FD_SPLIT", + #"H5FD_MPIO", + #"H5FD_MPIPOSIX", + #"H5FD_STREAM", +) + +HAVE_DIRECT_DRIVER = bool(H5_HAVE_DIRECT_DRIVER) +HAVE_WINDOWS_DRIVER = bool(H5_HAVE_WINDOWS_DRIVER) + +# Type extensions declarations (these are subclassed by PyTables +# Python classes) + +cdef class File: + cdef hid_t file_id + cdef hid_t access_plist + cdef object name + + def _g_new(self, name, pymode, **params): + cdef herr_t err = 0 + cdef hid_t access_plist, create_plist = H5P_DEFAULT + cdef hid_t meta_plist_id = H5P_DEFAULT, raw_plist_id = H5P_DEFAULT + cdef size_t img_buf_len = 0, user_block_size = 0 + cdef void *img_buf_p = NULL + cdef bytes encname + #cdef bytes logfile_name + + # Check if we can handle the driver + driver = params["DRIVER"] + if driver is not None and driver not in _supported_drivers: + raise ValueError("Invalid or not supported driver: '%s'" % driver) + if driver == "H5FD_SPLIT": + meta_ext = params.get("DRIVER_SPLIT_META_EXT", "-m.h5") + raw_ext = params.get("DRIVER_SPLIT_RAW_EXT", "-r.h5") + meta_name = meta_ext % name if "%s" in meta_ext else name + meta_ext + raw_name = raw_ext % name if "%s" in raw_ext else name + raw_ext + enc_meta_ext = encode_filename(meta_ext) + enc_raw_ext = encode_filename(raw_ext) + + # Create a new file using default properties + self.name = name + + # Encode the filename in case it is unicode + encname = encode_filename(name) + + # These fields can be seen from Python. + self._v_new = None # this will be computed later + # """Is this file going to be created from scratch?""" + + self._isPTFile = True # assume a PyTables file by default + # """Does this HDF5 file have a PyTables format?""" + + assert pymode in ('r', 'r+', 'a', 'w'), ("an invalid mode string ``%s`` " + "passed the ``check_file_access()`` test; " + "please report this to the authors" % pymode) + + image = params.get('DRIVER_CORE_IMAGE') + if image: + if driver != "H5FD_CORE": + warnings.warn("The DRIVER_CORE_IMAGE parameter will be ignored by " + "the '%s' driver" % driver) + elif not PyBytes_Check(image): + raise TypeError("The DRIVER_CORE_IMAGE must be a string of bytes") + elif not H5_HAVE_IMAGE_FILE: + raise RuntimeError("Support for image files is only available in " + "HDF5 >= 1.8.9") + + # After the following check we can be quite sure + # that the file or directory exists and permissions are right. + if driver == "H5FD_SPLIT": + for n in meta_name, raw_name: + check_file_access(n, pymode) + else: + backing_store = params.get("DRIVER_CORE_BACKING_STORE", 1) + if driver != "H5FD_CORE" or backing_store: + check_file_access(name, pymode) + + # Should a new file be created? 
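+    # (Summary of the decision below: 'w' always creates a new file,
+    # 'a' creates one only if the target does not exist yet, and 'r'/'r+'
+    # never create one.  A supplied in-memory image counts as existing.)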
+ if image: + exists = True + elif driver == "H5FD_SPLIT": + exists = os.path.exists(meta_name) and os.path.exists(raw_name) + else: + exists = os.path.exists(name) + self._v_new = not (pymode in ('r', 'r+') or (pymode == 'a' and exists)) + + user_block_size = params.get("USER_BLOCK_SIZE", 0) + if user_block_size and not self._v_new: + warnings.warn("The HDF5 file already esists: the USER_BLOCK_SIZE " + "will be ignored") + elif user_block_size: + user_block_size = int(user_block_size) + is_pow_of_2 = ((user_block_size & (user_block_size - 1)) == 0) + if user_block_size < 512 or not is_pow_of_2: + raise ValueError("The USER_BLOCK_SIZE must be a power od 2 greather " + "than 512 or zero") + + # File creation property list + create_plist = H5Pcreate(H5P_FILE_CREATE) + err = H5Pset_userblock(create_plist, user_block_size) + if err < 0: + H5Pclose(create_plist) + raise HDF5ExtError("Unable to set the user block size") + + # File access property list + access_plist = H5Pcreate(H5P_FILE_ACCESS) + + # Set parameters for chunk cache + H5Pset_cache(access_plist, 0, + params["CHUNK_CACHE_NELMTS"], + params["CHUNK_CACHE_SIZE"], + params["CHUNK_CACHE_PREEMPT"]) + + # Set the I/O driver + if driver == "H5FD_SEC2": + err = H5Pset_fapl_sec2(access_plist) + elif driver == "H5FD_DIRECT": + if not H5_HAVE_DIRECT_DRIVER: + H5Pclose(create_plist) + H5Pclose(access_plist) + raise RuntimeError("The H5FD_DIRECT driver is not available") + err = pt_H5Pset_fapl_direct(access_plist, + params["DRIVER_DIRECT_ALIGNMENT"], + params["DRIVER_DIRECT_BLOCK_SIZE"], + params["DRIVER_DIRECT_CBUF_SIZE"]) + #elif driver == "H5FD_LOG": + # if "DRIVER_LOG_FILE" not in params: + # H5Pclose(access_plist) + # raise ValueError("The DRIVER_LOG_FILE parameter is required for " + # "the H5FD_LOG driver") + # logfile_name = encode_filename(params["DRIVER_LOG_FILE"]) + # err = H5Pset_fapl_log(access_plist, + # logfile_name, + # params["DRIVER_LOG_FLAGS"], + # params["DRIVER_LOG_BUF_SIZE"]) + elif driver == "H5FD_WINDOWS": + if not H5_HAVE_WINDOWS_DRIVER: + H5Pclose(access_plist) + H5Pclose(create_plist) + raise RuntimeError("The H5FD_WINDOWS driver is not available") + err = pt_H5Pset_fapl_windows(access_plist) + elif driver == "H5FD_STDIO": + err = H5Pset_fapl_stdio(access_plist) + elif driver == "H5FD_CORE": + err = H5Pset_fapl_core(access_plist, + params["DRIVER_CORE_INCREMENT"], + backing_store) + if image: + img_buf_len = len(image) + img_buf_p = PyBytes_AsString(image) + err = pt_H5Pset_file_image(access_plist, img_buf_p, img_buf_len) + if err < 0: + H5Pclose(create_plist) + H5Pclose(access_plist) + raise HDF5ExtError("Unable to set the file image") + + #elif driver == "H5FD_FAMILY": + # H5Pset_fapl_family(access_plist, + # params["DRIVER_FAMILY_MEMB_SIZE"], + # fapl_id) + #elif driver == "H5FD_MULTI": + # err = H5Pset_fapl_multi(access_plist, memb_map, memb_fapl, memb_name, + # memb_addr, relax) + elif driver == "H5FD_SPLIT": + err = H5Pset_fapl_split(access_plist, enc_meta_ext, meta_plist_id, + enc_raw_ext, raw_plist_id) + if err < 0: + e = HDF5ExtError("Unable to set the file access property list") + H5Pclose(create_plist) + H5Pclose(access_plist) + raise e + + if pymode == 'r': + self.file_id = H5Fopen(encname, H5F_ACC_RDONLY, access_plist) + elif pymode == 'r+': + self.file_id = H5Fopen(encname, H5F_ACC_RDWR, access_plist) + elif pymode == 'a': + if exists: + # A test for logging. 
+ ## H5Pset_sieve_buf_size(access_plist, 0) + ## H5Pset_fapl_log (access_plist, "test.log", H5FD_LOG_LOC_WRITE, 0) + self.file_id = H5Fopen(encname, H5F_ACC_RDWR, access_plist) + else: + self.file_id = H5Fcreate(encname, H5F_ACC_TRUNC, create_plist, + access_plist) + elif pymode == 'w': + self.file_id = H5Fcreate(encname, H5F_ACC_TRUNC, create_plist, + access_plist) + + if self.file_id < 0: + e = HDF5ExtError("Unable to open/create file '%s'" % name) + H5Pclose(create_plist) + H5Pclose(access_plist) + raise e + + H5Pclose(create_plist) + H5Pclose(access_plist) + + # Set the cache size + set_cache_size(self.file_id, params["METADATA_CACHE_SIZE"]) + + # Set the maximum number of threads for Blosc + set_blosc_max_threads(params["MAX_BLOSC_THREADS"]) + + # XXX: add the possibility to pass a pre-allocated buffer + def get_file_image(self): + """Retrieves an in-memory image of an existing, open HDF5 file. + + .. note:: this method requires HDF5 >= 1.8.9. + + .. versionadded:: 3.0 + + """ + + cdef ssize_t size = 0 + cdef size_t buf_len = 0 + cdef bytes image + cdef char* cimage + + self.flush() + + # retrieve the size of the buffer for the file image + size = pt_H5Fget_file_image(self.file_id, NULL, buf_len) + if size < 0: + raise HDF5ExtError("Unable to retrieve the size of the buffer for the " + "file image. Plese note that not all drivers " + "provide support for image files.") + + # allocate the memory buffer + image = PyBytes_FromStringAndSize(NULL, size) + if not image: + raise RuntimeError("Unable to allecote meomory fir the file image") + + cimage = image + buf_len = size + size = pt_H5Fget_file_image(self.file_id, cimage, buf_len) + if size < 0: + raise HDF5ExtError("Unable to retrieve the file image. " + "Plese note that not all drivers provide support " + "for image files.") + + return image + + def get_filesize(self): + """Returns the size of an HDF5 file. + + The returned size is that of the entire file, as opposed to only + the HDF5 portion of the file. I.e., size includes the user block, + if any, the HDF5 portion of the file, and any data that may have + been appended beyond the data written through the HDF5 Library. + + .. versionadded:: 3.0 + + """ + + cdef herr_t err = 0 + cdef hsize_t size = 0 + + err = H5Fget_filesize(self.file_id, &size) + if err < 0: + raise HDF5ExtError("Unable to retrieve the HDF5 file size") + + return size + + def get_userblock_size(self): + """Retrieves the size of a user block. + + .. versionadded:: 3.0 + + """ + + cdef herr_t err = 0 + cdef hsize_t size = 0 + cdef hid_t create_plist + + create_plist = H5Fget_create_plist(self.file_id) + if create_plist < 0: + raise HDF5ExtError("Unable to get the creation property list") + + err = H5Pget_userblock(create_plist, &size) + if err < 0: + H5Pclose(create_plist) + raise HDF5ExtError("unable to retrieve the user block size") + + H5Pclose(create_plist) + + return size + + # Accessor definitions + def _get_file_id(self): + return self.file_id + + def fileno(self): + """Return the underlying OS integer file descriptor. + + This is needed for lower-level file interfaces, such as the ``fcntl`` + module. 
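+
+    A minimal sketch (assuming ``f`` is an open :class:`tables.File`)::
+
+        import fcntl
+
+        fcntl.flock(f.fileno(), fcntl.LOCK_SH)   # take an advisory shared lock
+        # ... read from the file ...
+        fcntl.flock(f.fileno(), fcntl.LOCK_UN)   # release it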
+ + """ + + cdef void *file_handle + cdef uintptr_t *descriptor + cdef herr_t err + err = H5Fget_vfd_handle(self.file_id, H5P_DEFAULT, &file_handle) + if err < 0: + raise HDF5ExtError( + "Problems getting file descriptor for file ``%s``" % self.name) + # Convert the 'void *file_handle' into an 'int *descriptor' + descriptor = file_handle + return descriptor[0] + + + def _flush_file(self, scope): + # Close the file + H5Fflush(self.file_id, scope) + + + def _close_file(self): + # Close the file + H5Fclose( self.file_id ) + self.file_id = 0 # Means file closed + + + # This method is moved out of scope, until we provide code to delete + # the memory booked by this extension types + def __dealloc__(self): + cdef int ret + if self.file_id > 0: + # Close the HDF5 file because user didn't do that! + ret = H5Fclose(self.file_id) + if ret < 0: + raise HDF5ExtError("Problems closing the file '%s'" % self.name) + + +cdef class AttributeSet: + cdef object name + + def _g_new(self, node): + self.name = node._v_name + + def _g_list_attr(self, node): + """Return a tuple with the attribute list""" + a = Aiterate(node._v_objectid) + return a + + + def _g_setattr(self, node, name, object value): + """Save Python or NumPy objects as HDF5 attributes. + + Scalar Python objects, scalar NumPy & 0-dim NumPy objects will all be + saved as H5T_SCALAR type. N-dim NumPy objects will be saved as H5T_ARRAY + type. + + """ + + cdef int ret + cdef hid_t dset_id, type_id + cdef hsize_t *dims + cdef ndarray ndv + cdef object byteorder, rabyteorder, baseatom + cdef char* cname = NULL + cdef bytes encoded_name + cdef int cset = H5T_CSET_DEFAULT + + encoded_name = name.encode('utf-8') + # get the C pointer + cname = encoded_name + + # The dataset id of the node + dset_id = node._v_objectid + + # Convert a NumPy scalar into a NumPy 0-dim ndarray + if isinstance(value, numpy.generic): + value = numpy.array(value) + + # Check if value is a NumPy ndarray and of a supported type + if (isinstance(value, numpy.ndarray) and + value.dtype.kind in ('V', 'S', 'b', 'i', 'u', 'f', 'c')): + # get a contiguous array: fixes #270 and gh-176 + #value = numpy.ascontiguousarray(value) + value = value.copy() + if value.dtype.kind == 'V': + description, rabyteorder = descr_from_dtype(value.dtype, ptparams=node._v_file.params) + byteorder = byteorders[rabyteorder] + type_id = create_nested_type(description, byteorder) + # Make sure the value is consistent with offsets of the description + value = value.astype(description._v_dtype) + else: + # Get the associated native HDF5 type of the scalar type + baseatom = Atom.from_dtype(value.dtype.base) + byteorder = byteorders[value.dtype.byteorder] + type_id = atom_to_hdf5_type(baseatom, byteorder) + # Get dimensionality info + ndv = value + dims = npy_malloc_dims(PyArray_NDIM(ndv), PyArray_DIMS(ndv)) + # Actually write the attribute + ret = H5ATTRset_attribute(dset_id, cname, type_id, + PyArray_NDIM(ndv), dims, PyArray_BYTES(ndv)) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in node:\n %s." % + (name, self._v_node)) + # Release resources + free(dims) + H5Tclose(type_id) + else: + # Object cannot be natively represented in HDF5. 
+ if (isinstance(value, numpy.ndarray) and + value.dtype.kind == 'U' and + value.shape == ()): + value = value[()].encode('utf-8') + cset = H5T_CSET_UTF8 + else: + # Convert this object to a null-terminated string + # (binary pickles are not supported at this moment) + value = pickle.dumps(value, 0) + + ret = H5ATTRset_attribute_string(dset_id, cname, value, len(value), cset) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in node:\n %s." % + (name, self._v_node)) + + + # Get attributes + def _g_getattr(self, node, attrname): + """Get HDF5 attributes and retrieve them as NumPy objects. + + H5T_SCALAR types will be retrieved as scalar NumPy. + H5T_ARRAY types will be retrieved as ndarray NumPy objects. + + """ + + cdef hsize_t *dims + cdef H5T_class_t class_id + cdef size_t type_size + cdef hid_t mem_type, dset_id, type_id, native_type + cdef int rank, ret, enumtype + cdef void *rbuf + cdef char *str_value + cdef char **str_values = NULL + cdef ndarray ndvalue + cdef object shape, stype_atom, shape_atom, retvalue + cdef int i, nelements + cdef char* cattrname = NULL + cdef bytes encoded_attrname + cdef int cset = H5T_CSET_DEFAULT + + encoded_attrname = attrname.encode('utf-8') + # Get the C pointer + cattrname = encoded_attrname + + # The dataset id of the node + dset_id = node._v_objectid + dims = NULL + + ret = H5ATTRget_type_ndims(dset_id, cattrname, &type_id, &class_id, + &type_size, &rank ) + if ret < 0: + raise HDF5ExtError("Can't get type info on attribute %s in node %s." % + (attrname, self.name)) + + # Call a fast function for scalar values and typical class types + if (rank == 0 and class_id == H5T_STRING): + type_size = H5ATTRget_attribute_string(dset_id, cattrname, &str_value, + &cset) + if type_size == 0: + if cset == H5T_CSET_UTF8: + retvalue = numpy.unicode_('') + else: + retvalue = numpy.bytes_(b'') + + elif cset == H5T_CSET_UTF8: + if type_size == 1 and str_value[0] == 0: + # compatibility with PyTables <= 3.1.1 + retvalue = numpy.unicode_('') + retvalue = PyUnicode_DecodeUTF8(str_value, type_size, NULL) + retvalue = numpy.unicode_(retvalue) + else: + retvalue = PyBytes_FromStringAndSize(str_value, type_size) + # AV: oct 2012 + # since now we use the string size got form HDF5 we have to strip + # trailing zeros used for padding. + # The entire process is quite odd but due to a bug (??) in the way + # numpy arrays are pickled in python 3 we can't assume that + # strlen(attr_value) is the actual length of the attibute + # and numpy.bytes_(attr_value) can give a truncated pickle sting + retvalue = retvalue.rstrip(b'\x00') + retvalue = numpy.bytes_(retvalue) # bytes + # Important to release attr_value, because it has been malloc'ed! + if str_value: + free(str_value) + H5Tclose(type_id) + return retvalue + elif (rank == 0 and class_id in (H5T_BITFIELD, H5T_INTEGER, H5T_FLOAT)): + dtype_ = get_dtype_scalar(type_id, class_id, type_size) + if dtype_ is None: + warnings.warn("Unsupported type for attribute '%s' in node '%s'. " + "Offending HDF5 class: %d" % (attrname, self.name, + class_id), DataTypeWarning) + self._v_unimplemented.append(attrname) + return None + shape = () + else: + # General case + + # Get the dimensional info + dims = malloc(rank * sizeof(hsize_t)) + ret = H5ATTRget_dims(dset_id, cattrname, dims) + if ret < 0: + raise HDF5ExtError("Can't get dims info on attribute %s in node %s." 
% + (attrname, self.name)) + shape = getshape(rank, dims) + # dims is not needed anymore + free( dims) + + # Get the NumPy dtype from the type_id + try: + stype_, shape_ = hdf5_to_np_ext_type(type_id, pure_numpy_types=True, ptparams=node._v_file.params) + dtype_ = numpy.dtype(stype_, shape_) + except TypeError: + if class_id == H5T_STRING and H5Tis_variable_str(type_id): + nelements = H5ATTRget_attribute_vlen_string_array(dset_id, cattrname, + &str_values, &cset) + if nelements < 0: + raise HDF5ExtError("Can't read attribute %s in node %s." % + (attrname, self.name)) + + # The following generator expressions do not work with Cython 0.15.1 + if cset == H5T_CSET_UTF8: + #retvalue = numpy.fromiter( + # PyUnicode_DecodeUTF8(str_values[i], + # strlen(str_values[i]), + # NULL) + # for i in range(nelements), "O8") + retvalue = numpy.array([ + PyUnicode_DecodeUTF8(str_values[i], + strlen(str_values[i]), + NULL) + for i in range(nelements)], "O8") + + else: + #retvalue = numpy.fromiter( + # str_values[i] for i in range(nelements), "O8") + retvalue = numpy.array( + [str_values[i] for i in range(nelements)], "O8") + retvalue.shape = shape + + # Important to release attr_value, because it has been malloc'ed! + for i in range(nelements): + free(str_values[i]) + free(str_values) + + return retvalue + + # This class is not supported. Instead of raising a TypeError, issue a + # warning explaining the problem. This will allow to continue browsing + # native HDF5 files, while informing the user about the problem. + warnings.warn("Unsupported type for attribute '%s' in node '%s'. " + "Offending HDF5 class: %d" % (attrname, self.name, + class_id), DataTypeWarning) + self._v_unimplemented.append(attrname) + return None + + # Get the container for data + ndvalue = numpy.empty(dtype=dtype_, shape=shape) + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(ndvalue) + # Actually read the attribute from disk + ret = H5ATTRget_attribute(dset_id, cattrname, type_id, rbuf) + if ret < 0: + raise HDF5ExtError("Attribute %s exists in node %s, but can't get it." % + (attrname, self.name)) + H5Tclose(type_id) + + if rank > 0: # multidimensional case + retvalue = ndvalue + else: + retvalue = ndvalue[()] # 0-dim ndarray becomes a NumPy scalar + + return retvalue + + + def _g_remove(self, node, attrname): + cdef int ret + cdef hid_t dset_id + cdef char *cattrname = NULL + cdef bytes encoded_attrname + + encoded_attrname = attrname.encode('utf-8') + # Get the C pointer + cattrname = encoded_attrname + + # The dataset id of the node + dset_id = node._v_objectid + + ret = H5Adelete(dset_id, cattrname) + if ret < 0: + raise HDF5ExtError("Attribute '%s' exists in node '%s', but cannot be " + "deleted." 
% (attrname, self.name)) + + +cdef class Node: + # Instance variables declared in .pxd + + def _g_new(self, where, name, init): + self.name = name + # """The name of this node in its parent group.""" + self.parent_id = where._v_objectid + # """The identifier of the parent group.""" + + def _g_delete(self, parent): + cdef int ret + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + # Delete this node + ret = H5Ldelete(parent._v_objectid, encoded_name, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("problems deleting the node ``%s``" % self.name) + return ret + + def __dealloc__(self): + self.parent_id = 0 + + def _get_obj_info(self): + cdef herr_t ret = 0 + cdef H5O_info_t oinfo + + ret = H5Oget_info(self._v_objectid, &oinfo) + if ret < 0: + raise HDF5ExtError("Unable to get object info for '%s'" % + self. _v_pathname) + + return ObjInfo(oinfo.addr, oinfo.rc) + + def _get_obj_timestamps(self): + cdef herr_t ret = 0 + cdef H5O_info_t oinfo + + ret = H5Oget_info(self._v_objectid, &oinfo) + if ret < 0: + raise HDF5ExtError("Unable to get object info for '%s'" % + self. _v_pathname) + + return ObjTimestamps(oinfo.atime, oinfo.mtime, oinfo.ctime, + oinfo.btime) + + +cdef class Group(Node): + cdef hid_t group_id + + def _g_create(self): + cdef hid_t ret + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + # @TODO: set property list --> utf-8 + + # Create a new group + ret = H5Gcreate(self.parent_id, encoded_name, H5P_DEFAULT, H5P_DEFAULT, + H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("Can't create the group %s." % self.name) + self.group_id = ret + return self.group_id + + def _g_open(self): + cdef hid_t ret + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + ret = H5Gopen(self.parent_id, encoded_name, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("Can't open the group: '%s'." % self.name) + self.group_id = ret + return self.group_id + + def _g_get_objinfo(self, object h5name): + """Check whether 'name' is a children of 'self' and return its type.""" + + cdef int ret + cdef object node_type + cdef bytes encoded_name + cdef char *cname + + encoded_name = h5name.encode('utf-8') + # Get the C pointer + cname = encoded_name + + ret = get_linkinfo(self.group_id, cname) + if ret == -2 or ret == H5L_TYPE_ERROR: + node_type = "NoSuchNode" + elif ret == H5L_TYPE_SOFT: + node_type = "SoftLink" + elif ret == H5L_TYPE_EXTERNAL: + node_type = "ExternalLink" + elif ret == H5L_TYPE_HARD: + ret = get_objinfo(self.group_id, cname) + if ret == -2: + node_type = "NoSuchNode" + elif ret == H5O_TYPE_UNKNOWN: + node_type = "Unknown" + elif ret == H5O_TYPE_GROUP: + node_type = "Group" + elif ret == H5O_TYPE_DATASET: + node_type = "Leaf" + elif ret == H5O_TYPE_NAMED_DATATYPE: + node_type = "NamedType" # Not supported yet + #else H5O_TYPE_LINK: + # # symbolic link + # raise RuntimeError('unexpected object type') + else: + node_type = "Unknown" + return node_type + + def _g_list_group(self, parent): + """Return a tuple with the groups and the leaves hanging from self.""" + + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + return Giterate(parent._v_objectid, self._v_objectid, encoded_name) + + + def _g_get_gchild_attr(self, group_name, attr_name): + """Return an attribute of a child `Group`. + + If the attribute does not exist, ``None`` is returned. 
+ + """ + + cdef hid_t gchild_id + cdef object retvalue + cdef bytes encoded_group_name + cdef bytes encoded_attr_name + + encoded_group_name = group_name.encode('utf-8') + encoded_attr_name = attr_name.encode('utf-8') + + # Open the group + retvalue = None # Default value + gchild_id = H5Gopen(self.group_id, encoded_group_name, H5P_DEFAULT) + if gchild_id < 0: + raise HDF5ExtError("Non-existing node ``%s`` under ``%s``" % + (group_name, self._v_pathname)) + retvalue = get_attribute_string_or_none(gchild_id, encoded_attr_name) + # Close child group + H5Gclose(gchild_id) + + return retvalue + + + def _g_get_lchild_attr(self, leaf_name, attr_name): + """Return an attribute of a child `Leaf`. + + If the attribute does not exist, ``None`` is returned. + + """ + + cdef hid_t leaf_id + cdef object retvalue + cdef bytes encoded_leaf_name + cdef bytes encoded_attr_name + + encoded_leaf_name = leaf_name.encode('utf-8') + encoded_attr_name = attr_name.encode('utf-8') + + # Open the dataset + leaf_id = H5Dopen(self.group_id, encoded_leaf_name, H5P_DEFAULT) + if leaf_id < 0: + raise HDF5ExtError("Non-existing node ``%s`` under ``%s``" % + (leaf_name, self._v_pathname)) + retvalue = get_attribute_string_or_none(leaf_id, encoded_attr_name) + # Close the dataset + H5Dclose(leaf_id) + return retvalue + + + def _g_flush_group(self): + # Close the group + H5Fflush(self.group_id, H5F_SCOPE_GLOBAL) + + + def _g_close_group(self): + cdef int ret + + ret = H5Gclose(self.group_id) + if ret < 0: + raise HDF5ExtError("Problems closing the Group %s" % self.name) + self.group_id = 0 # indicate that this group is closed + + + def _g_move_node(self, hid_t oldparent, oldname, hid_t newparent, newname, + oldpathname, newpathname): + cdef int ret + cdef bytes encoded_oldname, encoded_newname + + encoded_oldname = oldname.encode('utf-8') + encoded_newname = newname.encode('utf-8') + + ret = H5Lmove(oldparent, encoded_oldname, newparent, encoded_newname, + H5P_DEFAULT, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("Problems moving the node %s to %s" % + (oldpathname, newpathname) ) + return ret + + + +cdef class Leaf(Node): + # Instance variables declared in .pxd + + def _get_storage_size(self): + return H5Dget_storage_size(self.dataset_id) + + def _get_obj_track_times(self): + """Get track_times boolean for dataset + + Uses H5Pget_obj_track_times to determine if the dataset was + created with the track_times property. If the leaf is not a + dataset, this will fail with HDF5ExtError. + + The track times dataset creation property does not seem to survive + closing and reopening as of HDF5 1.8.17. 
Currently, it may be + more accurate to test whether the ctime for the dataset is 0: + track_times = (leaf._get_obj_timestamps().ctime == 0) + """ + cdef: + hbool_t track_times = True + + if self.dataset_id < 0: + raise ValueError('Invalid dataset id %s' % self.dataset_id) + + plist_id = H5Dget_create_plist(self.dataset_id) + if plist_id < 0: + raise HDF5ExtError("Could not get dataset creation property list " + "from dataset id %s" % self.dataset_id) + + try: + # Get track_times boolean for dataset + if H5Pget_obj_track_times(plist_id, &track_times) < 0: + raise HDF5ExtError("Could not get dataset track_times property " + "from dataset id %s" % self.dataset_id) + finally: + H5Pclose(plist_id) + + return bool(track_times) + + def _g_new(self, where, name, init): + if init: + # Put this info to 0 just when the class is initialized + self.dataset_id = -1 + self.type_id = -1 + self.base_type_id = -1 + self.disk_type_id = -1 + super()._g_new(where, name, init) + + cdef _get_type_ids(self): + """Get the disk and native HDF5 types associated with this leaf. + + It is guaranteed that both disk and native types are not the same + descriptor (so that it is safe to close them separately). + + """ + + cdef hid_t disk_type_id, native_type_id + + disk_type_id = H5Dget_type(self.dataset_id) + native_type_id = get_native_type(disk_type_id) + return disk_type_id, native_type_id + + cdef _convert_time64(self, ndarray nparr, int sense): + """Converts a NumPy of Time64 elements between NumPy and HDF5 formats. + + NumPy to HDF5 conversion is performed when 'sense' is 0. Otherwise, HDF5 + to NumPy conversion is performed. The conversion is done in place, + i.e. 'nparr' is modified. + + """ + + cdef void *t64buf + cdef long byteoffset, bytestride, nelements + cdef hsize_t nrecords + + byteoffset = 0 # NumPy objects doesn't have an offset + if (nparr).shape == (): + # 0-dim array does contain *one* element + nrecords = 1 + bytestride = 8 + else: + nrecords = len(nparr) + bytestride = PyArray_STRIDE(nparr, 0) # supports multi-dimensional recarray + nelements = nparr.size // nrecords + t64buf = PyArray_DATA(nparr) + + conv_float64_timeval32( + t64buf, byteoffset, bytestride, nrecords, nelements, sense) + + # can't do since cdef'd + + def _g_truncate(self, hsize_t size): + """Truncate a Leaf to `size` nrows.""" + + cdef hsize_t ret + + ret = truncate_dset(self.dataset_id, self.maindim, size) + if ret < 0: + raise HDF5ExtError("Problems truncating the leaf: %s" % self) + + classname = self.__class__.__name__ + if classname in ('EArray', 'CArray'): + # Update the new dimensionality + self.dims[self.maindim] = size + # Update the shape + shape = list(self.shape) + shape[self.maindim] = SizeType(size) + self.shape = tuple(shape) + elif classname in ('Table', 'VLArray'): + self.nrows = size + else: + raise ValueError("Unexpected classname: %s" % classname) + + def _g_flush(self): + # Flush the dataset (in fact, the entire buffers in file!) 
+ if self.dataset_id >= 0: + H5Fflush(self.dataset_id, H5F_SCOPE_GLOBAL) + + def _g_close(self): + # Close dataset in HDF5 space + # Release resources + if self.type_id >= 0: + H5Tclose(self.type_id) + if self.disk_type_id >= 0: + H5Tclose(self.disk_type_id) + if self.base_type_id >= 0: + H5Tclose(self.base_type_id) + if self.dataset_id >= 0: + H5Dclose(self.dataset_id) + + +cdef void* _array_data(ndarray arr): + # When the object is not a 0-d ndarray and its strides == 0, that + # means that the array does not contain actual data + cdef npy_intp i, ndim + + ndim = PyArray_NDIM(arr) + if ndim == 0: + return PyArray_DATA(arr) + for i in range(ndim): + if PyArray_STRIDE(arr, i) > 0: + return PyArray_DATA(arr) + return NULL + +cdef class Array(Leaf): + # Instance variables declared in .pxd + + def _create_array(self, ndarray nparr, object title, object atom): + cdef int i + cdef herr_t ret + cdef void *rbuf + cdef bytes complib, version, class_ + cdef object dtype_, atom_, shape + cdef ndarray dims + cdef bytes encoded_title, encoded_name + cdef H5T_cset_t cset = H5T_CSET_ASCII + + encoded_title = title.encode('utf-8') + encoded_name = self.name.encode('utf-8') + + # Get the HDF5 type associated with this numpy type + shape = (nparr).shape + if atom is None or atom.shape == (): + dtype_ = nparr.dtype.base + atom_ = Atom.from_dtype(dtype_) + else: + atom_ = atom + shape = shape[:-len(atom_.shape)] + self.disk_type_id = atom_to_hdf5_type(atom_, self.byteorder) + if self.disk_type_id < 0: + raise HDF5ExtError( + "Problems creating the %s: invalid disk type ID for atom %s" % ( + self.__class__.__name__, atom_)) + + # Allocate space for the dimension axis info and fill it + dims = numpy.array(shape, dtype=numpy.intp) + self.rank = len(shape) + self.dims = npy_malloc_dims(self.rank, PyArray_DATA(dims)) + rbuf = _array_data(nparr) + + # Save the array + complib = (self.filters.complib or '').encode('utf-8') + version = self._v_version.encode('utf-8') + class_ = self._c_classid.encode('utf-8') + self.dataset_id = H5ARRAYmake(self.parent_id, encoded_name, version, + self.rank, self.dims, + self.extdim, self.disk_type_id, NULL, NULL, + self.filters.complevel, complib, + self.filters.shuffle_bitshuffle, + self.filters.fletcher32, + self._want_track_times, + rbuf) + if self.dataset_id < 0: + raise HDF5ExtError("Problems creating the %s." 
% self.__class__.__name__) + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + cset = H5T_CSET_UTF8 + # Set the conforming array attributes + H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_, + len(class_), cset) + H5ATTRset_attribute_string(self.dataset_id, "VERSION", version, + len(version), cset) + H5ATTRset_attribute_string(self.dataset_id, "TITLE", encoded_title, + len(encoded_title), cset) + + # Get the native type (so that it is HDF5 who is the responsible to deal + # with non-native byteorders on-disk) + self.type_id = get_native_type(self.disk_type_id) + + return self.dataset_id, shape, atom_ + + + def _create_carray(self, object title): + cdef int i + cdef herr_t ret + cdef void *rbuf + cdef bytes complib, version, class_ + cdef ndarray dflts + cdef void *fill_data + cdef ndarray extdim + cdef object atom + cdef bytes encoded_title, encoded_name + + encoded_title = title.encode('utf-8') + encoded_name = self.name.encode('utf-8') + + atom = self.atom + self.disk_type_id = atom_to_hdf5_type(atom, self.byteorder) + + self.rank = len(self.shape) + self.dims = malloc_dims(self.shape) + if self.chunkshape: + self.dims_chunk = malloc_dims(self.chunkshape) + + rbuf = NULL # The data pointer. We don't have data to save initially + # Encode strings + complib = (self.filters.complib or '').encode('utf-8') + version = self._v_version.encode('utf-8') + class_ = self._c_classid.encode('utf-8') + + # Get the fill values + if isinstance(atom.dflt, numpy.ndarray) or atom.dflt: + dflts = numpy.array(atom.dflt, dtype=atom.dtype) + fill_data = PyArray_DATA(dflts) + else: + dflts = numpy.zeros((), dtype=atom.dtype) + fill_data = NULL + if atom.shape == (): + # The default is preferred as a scalar value instead of 0-dim array + atom.dflt = dflts[()] + else: + atom.dflt = dflts + + # Create the CArray/EArray + self.dataset_id = H5ARRAYmake(self.parent_id, encoded_name, version, + self.rank, self.dims, self.extdim, + self.disk_type_id, self.dims_chunk, + fill_data, + self.filters.complevel, complib, + self.filters.shuffle_bitshuffle, + self.filters.fletcher32, + self._want_track_times, rbuf) + if self.dataset_id < 0: + raise HDF5ExtError("Problems creating the %s." 
% self.__class__.__name__) + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + # Set the conforming array attributes + H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_, + len(class_), H5T_CSET_ASCII) + H5ATTRset_attribute_string(self.dataset_id, "VERSION", version, + len(version), H5T_CSET_ASCII) + H5ATTRset_attribute_string(self.dataset_id, "TITLE", encoded_title, + len(encoded_title), H5T_CSET_ASCII) + if self.extdim >= 0: + extdim = numpy.array([self.extdim], dtype="int32") + # Attach the EXTDIM attribute in case of enlargeable arrays + H5ATTRset_attribute(self.dataset_id, "EXTDIM", H5T_NATIVE_INT, + 0, NULL, PyArray_BYTES(extdim)) + + # Get the native type (so that it is HDF5 who is the responsible to deal + # with non-native byteorders on-disk) + self.type_id = get_native_type(self.disk_type_id) + + return self.dataset_id + + + def _open_array(self): + cdef size_t type_size, type_precision + cdef H5T_class_t class_id + cdef char cbyteorder[11] # "irrelevant" fits easily here + cdef int i + cdef int extdim + cdef herr_t ret + cdef object shape, chunkshapes, atom + cdef int fill_status + cdef ndarray dflts + cdef void *fill_data + cdef bytes encoded_name + cdef str byteorder + + encoded_name = self.name.encode('utf-8') + + # Open the dataset + self.dataset_id = H5Dopen(self.parent_id, encoded_name, H5P_DEFAULT) + if self.dataset_id < 0: + raise HDF5ExtError("Non-existing node ``%s`` under ``%s``" % + (self.name, self._v_parent._v_pathname)) + # Get the datatype handles + self.disk_type_id, self.type_id = self._get_type_ids() + # Get the atom for this type + atom = atom_from_hdf5_type(self.type_id) + + # Get the rank for this array object + if H5ARRAYget_ndims(self.dataset_id, &self.rank) < 0: + raise HDF5ExtError("Problems getting ndims!") + # Allocate space for the dimension axis info + self.dims = malloc(self.rank * sizeof(hsize_t)) + self.maxdims = malloc(self.rank * sizeof(hsize_t)) + # Get info on dimensions, class and type (of base class) + ret = H5ARRAYget_info(self.dataset_id, self.disk_type_id, + self.dims, self.maxdims, + &class_id, cbyteorder) + if ret < 0: + raise HDF5ExtError("Unable to get array info.") + + byteorder = cstr_to_pystr(cbyteorder) + + # Get the extendable dimension (if any) + self.extdim = -1 # default is non-extensible Array + for i from 0 <= i < self.rank: + if self.maxdims[i] == -1: + self.extdim = i + break + + # Get the shape as a python tuple + shape = getshape(self.rank, self.dims) + + # Allocate space for the dimension chunking info + self.dims_chunk = malloc(self.rank * sizeof(hsize_t)) + if H5ARRAYget_chunkshape(self.dataset_id, self.rank, self.dims_chunk) < 0: + # The Array class is not chunked! + chunkshapes = None + else: + # Get the chunkshape as a python tuple + chunkshapes = getshape(self.rank, self.dims_chunk) + + # object arrays should not be read directly into memory + if atom.dtype != object: + # Get the fill value + dflts = numpy.zeros((), dtype=atom.dtype) + fill_data = PyArray_DATA(dflts) + H5ARRAYget_fill_value(self.dataset_id, self.type_id, + &fill_status, fill_data); + if fill_status == H5D_FILL_VALUE_UNDEFINED: + # This can only happen with datasets created with other libraries + # than PyTables. 
+ dflts = None + if dflts is not None and atom.shape == (): + # The default is preferred as a scalar value instead of 0-dim array + atom.dflt = dflts[()] + else: + atom.dflt = dflts + + # Get the byteorder + self.byteorder = correct_byteorder(atom.type, byteorder) + + return self.dataset_id, atom, shape, chunkshapes + + + def _append(self, ndarray nparr): + cdef int ret, extdim + cdef hsize_t *dims_arr + cdef void *rbuf + cdef object shape + + if self.atom.kind == "reference": + raise ValueError("Cannot append to the reference types") + + # Allocate space for the dimension axis info + dims_arr = npy_malloc_dims(self.rank, PyArray_DIMS(nparr)) + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(nparr) + # Convert some NumPy types to HDF5 before storing. + if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + + # Append the records + extdim = self.extdim + with nogil: + ret = H5ARRAYappend_records(self.dataset_id, self.type_id, self.rank, + self.dims, dims_arr, extdim, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems appending the elements") + + free(dims_arr) + # Update the new dimensionality + shape = list(self.shape) + shape[self.extdim] = SizeType(self.dims[self.extdim]) + self.shape = tuple(shape) + + def _read_array(self, hsize_t start, hsize_t stop, hsize_t step, + ndarray nparr): + cdef herr_t ret + cdef void *rbuf + cdef hsize_t nrows + cdef int extdim + cdef size_t item_size = H5Tget_size(self.type_id) + cdef void * refbuf = NULL + + # Number of rows to read + nrows = get_len_of_range(start, stop, step) + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + refbuf = malloc(nrows * item_size) + rbuf = refbuf + else: + rbuf = PyArray_DATA(nparr) + + if hasattr(self, "extdim"): + extdim = self.extdim + else: + extdim = -1 + + # Do the physical read + with nogil: + ret = H5ARRAYread(self.dataset_id, self.type_id, start, nrows, step, + extdim, rbuf) + + try: + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + load_reference(self.dataset_id, rbuf, item_size, nparr) + finally: + if refbuf: + free(refbuf) + refbuf = NULL + + if self.atom.kind == 'time': + # Swap the byteorder by hand (this is not currently supported by HDF5) + if H5Tget_order(self.type_id) != platform_byteorder: + nparr.byteswap(True) + + # Convert some HDF5 types to NumPy after reading. 
+ if self.atom.type == 'time64': + self._convert_time64(nparr, 1) + + return + + + def _g_read_slice(self, ndarray startl, ndarray stopl, ndarray stepl, + ndarray nparr): + cdef herr_t ret + cdef hsize_t *start + cdef hsize_t *stop + cdef hsize_t *step + cdef void *rbuf + cdef size_t item_size = H5Tget_size(self.type_id) + cdef void * refbuf = NULL + + # Get the pointer to the buffer data area of startl, stopl and stepl arrays + start = PyArray_DATA(startl) + stop = PyArray_DATA(stopl) + step = PyArray_DATA(stepl) + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + refbuf = malloc(nparr.size * item_size) + rbuf = refbuf + else: + rbuf = PyArray_DATA(nparr) + + # Do the physical read + with nogil: + ret = H5ARRAYreadSlice(self.dataset_id, self.type_id, + start, stop, step, rbuf) + try: + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + load_reference(self.dataset_id, rbuf, item_size, nparr) + finally: + if refbuf: + free(refbuf) + refbuf = NULL + + if self.atom.kind == 'time': + # Swap the byteorder by hand (this is not currently supported by HDF5) + if H5Tget_order(self.type_id) != platform_byteorder: + nparr.byteswap(True) + + # Convert some HDF5 types to NumPy after reading + if self.atom.type == 'time64': + self._convert_time64(nparr, 1) + + return + + + def _g_read_coords(self, ndarray coords, ndarray nparr): + """Read coordinates in an already created NumPy array.""" + + cdef herr_t ret + cdef hid_t space_id + cdef hid_t mem_space_id + cdef hsize_t size + cdef void *rbuf + cdef object mode + cdef size_t item_size = H5Tget_size(self.type_id) + cdef void * refbuf = NULL + + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Create a memory dataspace handle + size = nparr.size + mem_space_id = H5Screate_simple(1, &size, NULL) + + # Select the dataspace to be read + H5Sselect_elements(space_id, H5S_SELECT_SET, + size, PyArray_DATA(coords)) + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + refbuf = malloc(nparr.size * item_size) + rbuf = refbuf + else: + rbuf = PyArray_DATA(nparr) + + # Do the actual read + with nogil: + ret = H5Dread(self.dataset_id, self.type_id, mem_space_id, space_id, + H5P_DEFAULT, rbuf) + + try: + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + load_reference(self.dataset_id, rbuf, item_size, nparr) + finally: + if refbuf: + free(refbuf) + refbuf = NULL + + # Terminate access to the memory dataspace + H5Sclose(mem_space_id) + # Terminate access to the dataspace + H5Sclose(space_id) + + if self.atom.kind == 'time': + # Swap the byteorder by hand (this is not currently supported by HDF5) + if H5Tget_order(self.type_id) != platform_byteorder: + nparr.byteswap(True) + + # Convert some HDF5 types to NumPy after reading + if self.atom.type == 'time64': + self._convert_time64(nparr, 1) + + return + + + def perform_selection(self, space_id, start, count, step, idx, mode): + """Performs a selection using start/count/step in the given axis. + + All other axes have their full range selected. The selection is + added to the current `space_id` selection using the given mode. + + Note: This is a backport from the h5py project. 
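+
+ For example (illustrative), on a dataset of shape ``(100, 50)``,
+ ``perform_selection(space_id, 10, 5, 2, 0, "AND")`` intersects the
+ current selection with rows 10, 12, 14, 16 and 18 (start 10, count 5,
+ step 2 along axis 0), keeping the full range of the second axis.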
+ + """ + + cdef int select_mode + cdef ndarray start_, count_, step_ + cdef hsize_t *startp + cdef hsize_t *countp + cdef hsize_t *stepp + + # Build arrays for the selection parameters + startl, countl, stepl = [], [], [] + for i, x in enumerate(self.shape): + if i != idx: + startl.append(0) + countl.append(x) + stepl.append(1) + else: + startl.append(start) + countl.append(count) + stepl.append(step) + start_ = numpy.array(startl, dtype="i8") + count_ = numpy.array(countl, dtype="i8") + step_ = numpy.array(stepl, dtype="i8") + + # Get the pointers to array data + startp = PyArray_DATA(start_) + countp = PyArray_DATA(count_) + stepp = PyArray_DATA(step_) + + # Do the actual selection + select_modes = {"AND": H5S_SELECT_AND, "NOTB": H5S_SELECT_NOTB} + assert mode in select_modes + select_mode = select_modes[mode] + H5Sselect_hyperslab(space_id, select_mode, + startp, stepp, countp, NULL) + + def _g_read_selection(self, object selection, ndarray nparr): + """Read a selection in an already created NumPy array.""" + + cdef herr_t ret + cdef hid_t space_id + cdef hid_t mem_space_id + cdef hsize_t size + cdef void *rbuf + cdef object mode + cdef size_t item_size = H5Tget_size(self.type_id) + cdef void * refbuf = NULL + + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Create a memory dataspace handle + size = nparr.size + mem_space_id = H5Screate_simple(1, &size, NULL) + + # Select the dataspace to be read + # Start by selecting everything + H5Sselect_all(space_id) + # Now refine with outstanding selections + for args in selection: + self.perform_selection(space_id, *args) + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + refbuf = malloc(nparr.size * item_size) + rbuf = refbuf + else: + rbuf = PyArray_DATA(nparr) + + # Do the actual read + with nogil: + ret = H5Dread(self.dataset_id, self.type_id, mem_space_id, space_id, + H5P_DEFAULT, rbuf) + + try: + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + # Get the pointer to the buffer data area + if self.atom.kind == "reference": + load_reference(self.dataset_id, rbuf, item_size, nparr) + finally: + if refbuf: + free(refbuf) + refbuf = NULL + + # Terminate access to the memory dataspace + H5Sclose(mem_space_id) + # Terminate access to the dataspace + H5Sclose(space_id) + + if self.atom.kind == 'time': + # Swap the byteorder by hand (this is not currently supported by HDF5) + if H5Tget_order(self.type_id) != platform_byteorder: + nparr.byteswap(True) + + # Convert some HDF5 types to NumPy after reading + if self.atom.type == 'time64': + self._convert_time64(nparr, 1) + + return + + + def _g_write_slice(self, ndarray startl, ndarray stepl, ndarray countl, + ndarray nparr): + """Write a slice in an already created NumPy array.""" + + cdef int ret + cdef void *rbuf + cdef void *temp + cdef hsize_t *start + cdef hsize_t *step + cdef hsize_t *count + + if self.atom.kind == "reference": + raise ValueError("Cannot write reference types yet") + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(nparr) + # Get the start, step and count values + start = PyArray_DATA(startl) + step = PyArray_DATA(stepl) + count = PyArray_DATA(countl) + + # Convert some NumPy types to HDF5 before storing. 
+ if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + + # Modify the elements: + with nogil: + ret = H5ARRAYwrite_records(self.dataset_id, self.type_id, self.rank, + start, step, count, rbuf) + + if ret < 0: + raise HDF5ExtError("Internal error modifying the elements " + "(H5ARRAYwrite_records returned errorcode -%i)" % (-ret)) + + return + + + def _g_write_coords(self, ndarray coords, ndarray nparr): + """Write a selection in an already created NumPy array.""" + + cdef herr_t ret + cdef hid_t space_id + cdef hid_t mem_space_id + cdef hsize_t size + cdef void *rbuf + cdef object mode + + if self.atom.kind == "reference": + raise ValueError("Cannot write reference types yet") + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Create a memory dataspace handle + size = nparr.size + mem_space_id = H5Screate_simple(1, &size, NULL) + + # Select the dataspace to be written + H5Sselect_elements(space_id, H5S_SELECT_SET, + size, PyArray_DATA(coords)) + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(nparr) + + # Convert some NumPy types to HDF5 before storing. + if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + + # Do the actual write + with nogil: + ret = H5Dwrite(self.dataset_id, self.type_id, mem_space_id, space_id, + H5P_DEFAULT, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems writing the array data.") + + # Terminate access to the memory dataspace + H5Sclose(mem_space_id) + # Terminate access to the dataspace + H5Sclose(space_id) + + return + + + def _g_write_selection(self, object selection, ndarray nparr): + """Write a selection in an already created NumPy array.""" + + cdef herr_t ret + cdef hid_t space_id + cdef hid_t mem_space_id + cdef hsize_t size + cdef void *rbuf + cdef object mode + + if self.atom.kind == "reference": + raise ValueError("Cannot write reference types yet") + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Create a memory dataspace handle + size = nparr.size + mem_space_id = H5Screate_simple(1, &size, NULL) + + # Select the dataspace to be written + # Start by selecting everything + H5Sselect_all(space_id) + # Now refine with outstanding selections + for args in selection: + self.perform_selection(space_id, *args) + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(nparr) + + # Convert some NumPy types to HDF5 before storing. 
+ if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + + # Do the actual write + with nogil: + ret = H5Dwrite(self.dataset_id, self.type_id, mem_space_id, space_id, + H5P_DEFAULT, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems writing the array data.") + + # Terminate access to the memory dataspace + H5Sclose(mem_space_id) + # Terminate access to the dataspace + H5Sclose(space_id) + + return + + + def __dealloc__(self): + if self.dims: + free(self.dims) + if self.maxdims: + free(self.maxdims) + if self.dims_chunk: + free(self.dims_chunk) + + +cdef class VLArray(Leaf): + # Instance variables + cdef hsize_t nrecords + + def _create_array(self, object title): + cdef int rank + cdef hsize_t *dims + cdef herr_t ret + cdef void *rbuf + cdef bytes complib, version, class_ + cdef object type_, itemsize, atom, scatom + cdef bytes encoded_title, encoded_name + cdef H5T_cset_t cset = H5T_CSET_ASCII + + encoded_title = title.encode('utf-8') + encoded_name = self.name.encode('utf-8') + + atom = self.atom + if not hasattr(atom, 'size'): # it is a pseudo-atom + atom = atom.base + + # Get the HDF5 type of the *scalar* atom + scatom = atom.copy(shape=()) + self.base_type_id = atom_to_hdf5_type(scatom, self.byteorder) + if self.base_type_id < 0: + raise HDF5ExtError( + "Problems creating the %s: invalid base type ID for atom %s" % ( + self.__class__.__name__, scatom)) + + # Allocate space for the dimension axis info + rank = len(atom.shape) + dims = malloc_dims(atom.shape) + + rbuf = NULL # We don't have data to save initially + + # Encode strings + complib = (self.filters.complib or '').encode('utf-8') + version = self._v_version.encode('utf-8') + class_ = self._c_classid.encode('utf-8') + + # Create the vlarray + self.dataset_id = H5VLARRAYmake(self.parent_id, encoded_name, version, + rank, dims, self.base_type_id, + self.chunkshape[0], rbuf, + self.filters.complevel, complib, + self.filters.shuffle_bitshuffle, + self.filters.fletcher32, + self._want_track_times, rbuf) + if dims: + free(dims) + if self.dataset_id < 0: + raise HDF5ExtError("Problems creating the VLArray.") + self.nrecords = 0 # Initialize the number of records saved + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + cset = H5T_CSET_UTF8 + # Set the conforming array attributes + H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_, + len(class_), cset) + H5ATTRset_attribute_string(self.dataset_id, "VERSION", version, + len(version), cset) + H5ATTRset_attribute_string(self.dataset_id, "TITLE", encoded_title, + len(encoded_title), cset) + + # Get the datatype handles + self.disk_type_id, self.type_id = self._get_type_ids() + + return self.dataset_id + + + def _open_array(self): + cdef char cbyteorder[11] # "irrelevant" fits easily here + cdef int i, enumtype + cdef int rank + cdef herr_t ret + cdef hsize_t nrecords, chunksize + cdef object shape, type_ + cdef bytes encoded_name + cdef str byteorder + + encoded_name = self.name.encode('utf-8') + + # Open the dataset + self.dataset_id = H5Dopen(self.parent_id, encoded_name, H5P_DEFAULT) + if self.dataset_id < 0: + raise HDF5ExtError("Non-existing node ``%s`` under ``%s``" % + (self.name, self._v_parent._v_pathname)) + # Get the datatype handles + self.disk_type_id, self.type_id = self._get_type_ids() + # Get the atom for this type + atom = atom_from_hdf5_type(self.type_id) + + # Get info on dimensions & types (of base class) + H5VLARRAYget_info(self.dataset_id, self.disk_type_id, &nrecords, + cbyteorder) + + byteorder = cstr_to_pystr(cbyteorder) + + # Get some 
properties of the atomic type + self._atomicdtype = atom.dtype + self._atomictype = atom.type + self._atomicshape = atom.shape + self._atomicsize = atom.size + + # Get the byteorder + self.byteorder = correct_byteorder(atom.type, byteorder) + + # Get the chunkshape (VLArrays are unidimensional entities) + H5ARRAYget_chunkshape(self.dataset_id, 1, &chunksize) + + self.nrecords = nrecords # Initialize the number of records saved + return self.dataset_id, SizeType(nrecords), (SizeType(chunksize),), atom + + + def _append(self, ndarray nparr, int nobjects): + cdef int ret + cdef void *rbuf + + # Get the pointer to the buffer data area + if nobjects: + rbuf = PyArray_DATA(nparr) + # Convert some NumPy types to HDF5 before storing. + if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + else: + rbuf = NULL + + # Append the records: + with nogil: + ret = H5VLARRAYappend_records(self.dataset_id, self.type_id, + nobjects, self.nrecords, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems appending the records.") + + self.nrecords = self.nrecords + 1 + + def _modify(self, hsize_t nrow, ndarray nparr, int nobjects): + cdef int ret + cdef void *rbuf + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(nparr) + if nobjects: + # Convert some NumPy types to HDF5 before storing. + if self.atom.type == 'time64': + self._convert_time64(nparr, 0) + + # Append the records: + with nogil: + ret = H5VLARRAYmodify_records(self.dataset_id, self.type_id, + nrow, nobjects, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems modifying the record.") + + return nobjects + + # Because the size of each "row" is unknown, there is no easy way to + # calculate this value + def _get_memory_size(self): + cdef hid_t space_id + cdef hsize_t size + cdef herr_t ret + + if self.nrows == 0: + size = 0 + else: + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Return the size of the entire dataset + ret = H5Dvlen_get_buf_size(self.dataset_id, self.type_id, space_id, + &size) + if ret < 0: + size = -1 + + # Terminate access to the dataspace + H5Sclose(space_id) + + return size + + def _read_array(self, hsize_t start, hsize_t stop, hsize_t step): + cdef int i + cdef size_t vllen + cdef herr_t ret + cdef hvl_t *rdata + cdef hsize_t nrows + cdef hid_t space_id + cdef hid_t mem_space_id + cdef object buf, nparr, shape, datalist + + # Compute the number of rows to read + nrows = get_len_of_range(start, stop, step) + if start + nrows > self.nrows: + raise HDF5ExtError( + "Asking for a range of rows exceeding the available ones!.", + h5bt=False) + + # Now, read the chunk of rows + with nogil: + # Allocate the necessary memory for keeping the row handlers + rdata = malloc(nrows*sizeof(hvl_t)) + # Get the dataspace handle + space_id = H5Dget_space(self.dataset_id) + # Create a memory dataspace handle + mem_space_id = H5Screate_simple(1, &nrows, NULL) + # Select the data to be read + H5Sselect_hyperslab(space_id, H5S_SELECT_SET, &start, &step, &nrows, + NULL) + # Do the actual read + ret = H5Dread(self.dataset_id, self.type_id, mem_space_id, space_id, + H5P_DEFAULT, rdata) + + if ret < 0: + raise HDF5ExtError( + "VLArray._read_array: Problems reading the array data.") + + datalist = [] + for i from 0 <= i < nrows: + # Number of atoms in row + vllen = rdata[i].len + # Get the pointer to the buffer data area + if vllen > 0: + # Create a buffer to keep this info. It is important to do a + # copy, because we will dispose the buffer memory later on by + # calling the H5Dvlen_reclaim. 
PyByteArray_FromStringAndSize does this.
+ buf = PyByteArray_FromStringAndSize(rdata[i].p,
+ vllen*self._atomicsize)
+ else:
+ # Case where there is info with zero length
+ buf = None
+ # Compute the shape for the read array
+ shape = list(self._atomicshape)
+ shape.insert(0, vllen) # put the length at the beginning of the shape
+ nparr = numpy.ndarray(
+ buffer=buf, dtype=self._atomicdtype.base, shape=shape)
+ # Set the writeable flag for this ndarray object
+ nparr.flags.writeable = True
+ if self.atom.kind == 'time':
+ # Swap the byteorder by hand (this is not currently supported by HDF5)
+ if H5Tget_order(self.type_id) != platform_byteorder:
+ nparr.byteswap(True)
+ # Convert some HDF5 types to NumPy after reading.
+ if self.atom.type == 'time64':
+ self._convert_time64(nparr, 1)
+ # Append this array to the output list
+ datalist.append(nparr)
+
+ # Release resources
+ # Reclaim all the (nested) VL data
+ ret = H5Dvlen_reclaim(self.type_id, mem_space_id, H5P_DEFAULT, rdata)
+ if ret < 0:
+ raise HDF5ExtError("VLArray._read_array: error freeing the data buffer.")
+ # Terminate access to the memory dataspace
+ H5Sclose(mem_space_id)
+ # Terminate access to the dataspace
+ H5Sclose(space_id)
+ # Free the array of row pointers to VL row data
+ free(rdata)
+
+ return datalist
+
+
+ def get_row_size(self, row):
+ """Return the total size in bytes of all the elements contained in a given row."""
+
+ cdef hid_t space_id
+ cdef hsize_t size
+ cdef herr_t ret
+
+ cdef hsize_t offset[1]
+ cdef hsize_t count[1]
+
+ if row >= self.nrows:
+ raise HDF5ExtError(
+ "Asking for a range of rows exceeding the available ones!.",
+ h5bt=False)
+
+ # Get the dataspace handle
+ space_id = H5Dget_space(self.dataset_id)
+
+ offset[0] = row
+ count[0] = 1
+
+ ret = H5Sselect_hyperslab(space_id, H5S_SELECT_SET, offset, NULL, count, NULL);
+ if ret < 0:
+ size = -1
+
+ ret = H5Dvlen_get_buf_size(self.dataset_id, self.type_id, space_id, &size)
+ if ret < 0:
+ size = -1
+
+ # Terminate access to the dataspace
+ H5Sclose(space_id)
+
+ return size
+
+
+cdef class UnImplemented(Leaf):
+
+ def _open_unimplemented(self):
+ cdef object shape
+ cdef char cbyteorder[11] # "irrelevant" fits easily here
+ cdef bytes encoded_name
+ cdef str byteorder
+
+ encoded_name = self.name.encode('utf-8')
+
+ # Get info on dimensions
+ shape = H5UIget_info(self.parent_id, encoded_name, cbyteorder)
+ shape = tuple(map(SizeType, shape))
+ self.dataset_id = H5Dopen(self.parent_id, encoded_name, H5P_DEFAULT)
+ byteorder = cstr_to_pystr(cbyteorder)
+
+ return (shape, byteorder, self.dataset_id)
+
+ def _g_close(self):
+ H5Dclose(self.dataset_id)
+
+
+## Local Variables:
+## mode: python
+## py-indent-offset: 2
+## tab-width: 2
+## fill-column: 78
+## End:
diff --git a/tables/idxutils.py b/tables/idxutils.py
new file mode 100644
index 0000000..6578d01
--- /dev/null
+++ b/tables/idxutils.py
@@ -0,0 +1,492 @@
+"""Utilities to be used mainly by the Index class."""
+
+import math
+import numpy as np
+
+
+# Hints for chunk/slice/block/superblock computations:
+# - The slicesize should not exceed 2**32 elements (because of
+# implementation reasons). Such an extreme case would make the
+# sorting algorithms consume up to 64 GB of memory.
+# - In general, one should favor a small chunksize ( < 128 KB) if one
+# wants to reduce the latency for indexed queries. However, keep in
+# mind that a very low value of chunksize for big datasets may hurt
+# the performance by requiring HDF5 to use a lot of memory and CPU
+# for its internal B-Tree.
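+#
+# Illustrative example (not normative): with the csformula() below, an
+# index with 10**6 elements gets a fitted chunksize of about
+# 64 * 2**log10(10**6) = 64 * 2**6 = 4096 elements, while one with 10**9
+# elements gets 64 * 2**9 = 32768 elements.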
+
+def csformula(nrows):
+ """Return the fitted chunksize (a float value) for nrows."""
+
+ # This formula has been computed using two points:
+ # 2**12 = m * 2**(n + log10(10**6))
+ # 2**15 = m * 2**(n + log10(10**9))
+ # where 2**12 and 2**15 are reasonable values for chunksizes for indexes
+ # with 10**6 and 10**9 elements respectively.
+ # Yes, return a floating point number!
+ return 64 * 2**math.log10(nrows)
+
+
+def limit_er(expectedrows):
+ """Protection against creating too small or too large chunks or slices."""
+
+ if expectedrows < 10**5:
+ expectedrows = 10**5
+ elif expectedrows > 10**12:
+ expectedrows = 10**12
+ return expectedrows
+
+
+def computechunksize(expectedrows):
+ """Get the optimum chunksize based on expectedrows."""
+
+ expectedrows = limit_er(expectedrows)
+ zone = int(math.log10(expectedrows))
+ nrows = 10**zone
+ return int(csformula(nrows))
+
+
+def computeslicesize(expectedrows, memlevel):
+ """Get the optimum slicesize based on expectedrows and memlevel."""
+
+ expectedrows = limit_er(expectedrows)
+ # First, the optimum chunksize
+ cs = csformula(expectedrows)
+ # Now, the actual chunksize
+ chunksize = computechunksize(expectedrows)
+ # The optimal slicesize
+ ss = int(cs * memlevel**2)
+ # We *need* slicesize to be an exact multiple of the actual chunksize
+ ss = (ss // chunksize) * chunksize
+ ss *= 4 # slicesize should be at least divisible by 4
+ # ss cannot be bigger than 2**31 - 1 elements because of fundamental
+ # reasons (this limitation comes mainly from the way indices for
+ # indexes are computed, but also because C keysort is not implemented
+ # yet for the string type). Besides, it cannot be larger than
+ # 2**30, because of limitations of the optimized binary search code
+ # (in idx-opt.c, the line ``mid = lo + (hi-lo)/2;`` will overflow
+ # for values of ``lo`` and ``hi`` >= 2**30). Finally, ss must be a
+ # multiple of 4, so 2**30 must definitely be an upper limit.
+ if ss > 2**30:
+ ss = 2**30
+ return ss
+
+
+def computeblocksize(expectedrows, compoundsize, lowercompoundsize):
+ """Calculate the optimum number of superblocks made from compound blocks.
+
+ This is useful for computing the sizes of both blocks and
+ superblocks (using the PyTables terminology for blocks in indexes).
+
+ """
+
+ nlowerblocks = (expectedrows // lowercompoundsize) + 1
+ if nlowerblocks > 2**20:
+ # Protection against a too large number of compound blocks
+ nlowerblocks = 2**20
+ size = int(lowercompoundsize * nlowerblocks)
+ # We *need* superblocksize to be an exact multiple of the actual
+ # compoundblock size (a ceil must be performed here!)
+ size = ((size // compoundsize) + 1) * compoundsize
+ return size
+
+
+def calc_chunksize(expectedrows, optlevel=6, indsize=4, memlevel=4, node=None):
+ """Calculate the HDF5 chunk size for index and sorted arrays.
+
+ The logic to do that is based purely on experiments playing with
+ different chunksizes and compression flags. It is obvious that using
+ big chunks optimizes the I/O speed, but if they are too large, the
+ decompressor takes too much time. This might (should) be further
+ optimized by doing more experiments.
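+
+ The return value is a tuple ``(superblocksize, blocksize, slicesize,
+ chunksize)``.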
+ + """ + + chunksize = computechunksize(expectedrows) + slicesize = computeslicesize(expectedrows, memlevel) + + # Avoid excessive slicesize in Indexes, see https://github.com/PyTables/PyTables/issues/879 + if node is not None: + maxsize = node._v_file.params['BUFFER_TIMES'] * node._v_file.params['IO_BUFFER_SIZE'] + while (slicesize * node.dtype.itemsize) > maxsize: + slicesize = slicesize // 2 + + # Correct the slicesize and the chunksize based on optlevel + if indsize == 1: # ultralight + chunksize, slicesize = ccs_ultralight(optlevel, chunksize, slicesize) + elif indsize == 2: # light + chunksize, slicesize = ccs_light(optlevel, chunksize, slicesize) + elif indsize == 4: # medium + chunksize, slicesize = ccs_medium(optlevel, chunksize, slicesize) + elif indsize == 8: # full + chunksize, slicesize = ccs_full(optlevel, chunksize, slicesize) + + # Finally, compute blocksize and superblocksize + blocksize = computeblocksize(expectedrows, slicesize, chunksize) + superblocksize = computeblocksize(expectedrows, blocksize, slicesize) + # The size for different blocks information + sizes = (superblocksize, blocksize, slicesize, chunksize) + return sizes + + +def ccs_ultralight(optlevel, chunksize, slicesize): + """Correct the slicesize and the chunksize based on optlevel.""" + + if optlevel in (0, 1, 2): + slicesize //= 2 + slicesize += optlevel * slicesize + elif optlevel in (3, 4, 5): + slicesize *= optlevel - 1 + elif optlevel in (6, 7, 8): + slicesize *= optlevel - 1 + elif optlevel == 9: + slicesize *= optlevel - 1 + return chunksize, slicesize + + +def ccs_light(optlevel, chunksize, slicesize): + """Correct the slicesize and the chunksize based on optlevel.""" + + if optlevel in (0, 1, 2): + slicesize //= 2 + elif optlevel in (3, 4, 5): + pass + elif optlevel in (6, 7, 8): + chunksize //= 2 + elif optlevel == 9: + # Reducing the chunksize and enlarging the slicesize is the + # best way to reduce the entropy with the current algorithm. + chunksize //= 2 + slicesize *= 2 + return chunksize, slicesize + + +def ccs_medium(optlevel, chunksize, slicesize): + """Correct the slicesize and the chunksize based on optlevel.""" + + if optlevel in (0, 1, 2): + slicesize //= 2 + elif optlevel in (3, 4, 5): + pass + elif optlevel in (6, 7, 8): + chunksize //= 2 + elif optlevel == 9: + # Reducing the chunksize and enlarging the slicesize is the + # best way to reduce the entropy with the current algorithm. + chunksize //= 2 + slicesize *= 2 + return chunksize, slicesize + + +def ccs_full(optlevel, chunksize, slicesize): + """Correct the slicesize and the chunksize based on optlevel.""" + + if optlevel in (0, 1, 2): + slicesize //= 2 + elif optlevel in (3, 4, 5): + pass + elif optlevel in (6, 7, 8): + chunksize //= 2 + elif optlevel == 9: + # Reducing the chunksize and enlarging the slicesize is the + # best way to reduce the entropy with the current algorithm. + chunksize //= 2 + slicesize *= 2 + return chunksize, slicesize + + +def calcoptlevels(nblocks, optlevel, indsize): + """Compute the optimizations to be done. + + The calculation is based on the number of blocks, optlevel and + indexing mode. 
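+
+ The return value is a 4-tuple ``(optmedian, optstarts, optstops,
+ optfull)``, as produced by the ``col_*`` helpers below.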
+ + """ + + if indsize == 2: # light + return col_light(nblocks, optlevel) + elif indsize == 4: # medium + return col_medium(nblocks, optlevel) + elif indsize == 8: # full + return col_full(nblocks, optlevel) + + +def col_light(nblocks, optlevel): + """Compute the optimizations to be done for light indexes.""" + + optmedian, optstarts, optstops, optfull = (False,) * 4 + + if 0 < optlevel <= 3: + optmedian = True + elif 3 < optlevel <= 6: + optmedian, optstarts = (True, True) + elif 6 < optlevel <= 9: + optmedian, optstarts, optstops = (True, True, True) + + return optmedian, optstarts, optstops, optfull + + +def col_medium(nblocks, optlevel): + """Compute the optimizations to be done for medium indexes.""" + + optmedian, optstarts, optstops, optfull = (False,) * 4 + + # Medium case + if nblocks <= 1: + if 0 < optlevel <= 3: + optmedian = True + elif 3 < optlevel <= 6: + optmedian, optstarts = (True, True) + elif 6 < optlevel <= 9: + optfull = 1 + else: # More than a block + if 0 < optlevel <= 3: + optfull = 1 + elif 3 < optlevel <= 6: + optfull = 2 + elif 6 < optlevel <= 9: + optfull = 3 + + return optmedian, optstarts, optstops, optfull + + +def col_full(nblocks, optlevel): + """Compute the optimizations to be done for full indexes.""" + + optmedian, optstarts, optstops, optfull = (False,) * 4 + + # Full case + if nblocks <= 1: + if 0 < optlevel <= 3: + optmedian = True + elif 3 < optlevel <= 6: + optmedian, optstarts = (True, True) + elif 6 < optlevel <= 9: + optfull = 1 + else: # More than a block + if 0 < optlevel <= 3: + optfull = 1 + elif 3 < optlevel <= 6: + optfull = 2 + elif 6 < optlevel <= 9: + optfull = 3 + + return optmedian, optstarts, optstops, optfull + + +def get_reduction_level(indsize, optlevel, slicesize, chunksize): + """Compute the reduction level based on indsize and optlevel.""" + rlevels = [ + [8, 8, 8, 8, 4, 4, 4, 2, 2, 1], # 8-bit indices (ultralight) + [4, 4, 4, 4, 2, 2, 2, 1, 1, 1], # 16-bit indices (light) + [2, 2, 2, 2, 1, 1, 1, 1, 1, 1], # 32-bit indices (medium) + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], # 64-bit indices (full) + ] + isizes = {1: 0, 2: 1, 4: 2, 8: 3} + rlevel = rlevels[isizes[indsize]][optlevel] + # The next cases should only happen in tests + if rlevel >= slicesize: + rlevel = 1 + if slicesize <= chunksize * rlevel: + rlevel = 1 + if indsize == 8: + # Ensure that, for full indexes we will never perform a reduction. + # This is required because of implementation assumptions. + assert rlevel == 1 + return rlevel + + +# Python implementations of NextAfter and NextAfterF +# +# These implementations exist because the standard function +# nextafterf is not available on Microsoft platforms. +# +# These implementations are based on the IEEE representation of +# floats and doubles. +# Author: Shack Toms - shack@livedata.com +# +# Thanks to Shack Toms shack@livedata.com for NextAfter and NextAfterF +# implementations in Python. 
2004-10-01 +# epsilon = math.ldexp(1.0, -53) # smallest double such that +# # 0.5 + epsilon != 0.5 +# epsilonF = math.ldexp(1.0, -24) # smallest float such that 0.5 + epsilonF +# != 0.5 +# maxFloat = float(2**1024 - 2**971) # From the IEEE 754 standard +# maxFloatF = float(2**128 - 2**104) # From the IEEE 754 standard +# minFloat = math.ldexp(1.0, -1022) # min positive normalized double +# minFloatF = math.ldexp(1.0, -126) # min positive normalized float +# smallEpsilon = math.ldexp(1.0, -1074) # smallest increment for +# # doubles < minFloat +# smallEpsilonF = math.ldexp(1.0, -149) # smallest increment for +# # floats < minFloatF +infinity = math.ldexp(1.0, 1023) * 2 +infinityf = math.ldexp(1.0, 128) +# Finf = float("inf") # Infinite in the IEEE 754 standard (not avail in Win) + +# A portable representation of NaN +# if sys.byteorder == "little": +# testNaN = struct.unpack("d", '\x01\x00\x00\x00\x00\x00\xf0\x7f')[0] +# elif sys.byteorder == "big": +# testNaN = struct.unpack("d", '\x7f\xf0\x00\x00\x00\x00\x00\x01')[0] +# else: +# raise ValueError("Byteorder '%s' not supported!" % sys.byteorder) +# This one seems better +# testNaN = infinity - infinity + +# "infinity" for several types +infinitymap = { + 'bool': [0, 1], + 'int8': [-2**7, 2**7 - 1], + 'uint8': [0, 2**8 - 1], + 'int16': [-2**15, 2**15 - 1], + 'uint16': [0, 2**16 - 1], + 'int32': [-2**31, 2**31 - 1], + 'uint32': [0, 2**32 - 1], + 'int64': [-2**63, 2**63 - 1], + 'uint64': [0, 2**64 - 1], + 'float32': [-infinityf, infinityf], + 'float64': [-infinity, infinity], +} + +if hasattr(np, 'float16'): + infinitymap['float16'] = [-np.float16(np.inf), + np.float16(np.inf)] +if hasattr(np, 'float96'): + infinitymap['float96'] = [-np.float96(np.inf), + np.float96(np.inf)] +if hasattr(np, 'float128'): + infinitymap['float128'] = [-np.float128(np.inf), + np.float128(np.inf)] + +# deprecated API +infinityMap = infinitymap +infinityF = infinityf + +# Utility functions + + +def inftype(dtype, itemsize, sign=+1): + """Return a superior limit for maximum representable data type.""" + + assert sign in [-1, +1] + + if dtype.kind == "S": + if sign < 0: + return b"\x00" * itemsize + else: + return b"\xff" * itemsize + try: + return infinitymap[dtype.name][sign >= 0] + except KeyError: + raise TypeError("Type %s is not supported" % dtype.name) + + +def string_next_after(x, direction, itemsize): + """Return the next representable neighbor of x in the appropriate + direction.""" + + assert direction in [-1, +1] + + # Pad the string with \x00 chars until itemsize completion + padsize = itemsize - len(x) + if padsize > 0: + x += b"\x00" * padsize + # int.to_bytes is not available in Python < 3.2 + # xlist = [i.to_bytes(1, sys.byteorder) for i in x] + xlist = [bytes([i]) for i in x] + xlist.reverse() + i = 0 + if direction > 0: + if xlist == b"\xff" * itemsize: + # Maximum value, return this + return b"".join(xlist) + for xchar in xlist: + if ord(xchar) < 0xff: + xlist[i] = chr(ord(xchar) + 1).encode('ascii') + break + else: + xlist[i] = b"\x00" + i += 1 + else: + if xlist == b"\x00" * itemsize: + # Minimum value, return this + return b"".join(xlist) + for xchar in xlist: + if ord(xchar) > 0x00: + xlist[i] = chr(ord(xchar) - 1).encode('ascii') + break + else: + xlist[i] = b"\xff" + i += 1 + xlist.reverse() + return b"".join(xlist) + + +def int_type_next_after(x, direction, itemsize): + """Return the next representable neighbor of x in the appropriate + direction.""" + + assert direction in [-1, +1] + + # x is guaranteed to be either an int or a float + if 
direction < 0: + if isinstance(x, int): + return x - 1 + else: + # return int(PyNextAfter(x, x - 1)) + return int(np.nextafter(x, x - 1)) + else: + if isinstance(x, int): + return x + 1 + else: + # return int(PyNextAfter(x,x + 1)) + 1 + return int(np.nextafter(x, x + 1)) + 1 + + +def bool_type_next_after(x, direction, itemsize): + """Return the next representable neighbor of x in the appropriate + direction.""" + + assert direction in [-1, +1] + + # x is guaranteed to be either a boolean + if direction < 0: + return False + else: + return True + + +def nextafter(x, direction, dtype, itemsize): + """Return the next representable neighbor of x in the appropriate + direction.""" + + assert direction in [-1, 0, +1] + assert dtype.kind == "S" or type(x) in (bool, float, int) + + if direction == 0: + return x + + if dtype.kind == "S": + return string_next_after(x, direction, itemsize) + + if dtype.kind in ['b']: + return bool_type_next_after(x, direction, itemsize) + elif dtype.kind in ['i', 'u']: + return int_type_next_after(x, direction, itemsize) + elif dtype.kind == "f": + if direction < 0: + return np.nextafter(x, x - 1) + else: + return np.nextafter(x, x + 1) + + # elif dtype.name == "float32": + # if direction < 0: + # return PyNextAfterF(x,x-1) + # else: + # return PyNextAfterF(x,x + 1) + # elif dtype.name == "float64": + # if direction < 0: + # return PyNextAfter(x,x-1) + # else: + # return PyNextAfter(x,x + 1) + + raise TypeError("data type ``%s`` is not supported" % dtype) diff --git a/tables/index.py b/tables/index.py new file mode 100644 index 0000000..e8c8caf --- /dev/null +++ b/tables/index.py @@ -0,0 +1,2186 @@ +"""Here is defined the Index class.""" + +import math +import operator +import os +import sys +import tempfile +import warnings + +from pathlib import Path +from time import perf_counter as clock +from time import process_time as cpuclock + +import numpy as np + +from .idxutils import (calc_chunksize, calcoptlevels, + get_reduction_level, nextafter, inftype) + +from . import indexesextension +from .node import NotLoggedMixin +from .atom import UIntAtom, Atom +from .earray import EArray +from .carray import CArray +from .leaf import Filters +from .indexes import CacheArray, LastRowArray, IndexArray +from .group import Group +from .path import join_path +from .exceptions import PerformanceWarning +from .utils import is_idx, idx2long, lazyattr +from .utilsextension import (nan_aware_gt, nan_aware_ge, + nan_aware_lt, nan_aware_le, + bisect_left, bisect_right) +from .lrucacheextension import ObjectCache + +# default version for INDEX objects +# obversion = "1.0" # Version of indexes in PyTables 1.x series +# obversion = "2.0" # Version of indexes in PyTables Pro 2.0 series +obversion = "2.1" # Version of indexes in PyTables Pro 2.1 and up series, +# # including the join 2.3 Std + Pro version + +debug = False +# debug = True # Uncomment this for printing sizes purposes +profile = False +# profile = True # Uncomment for profiling +if profile: + from .utils import show_stats + +# The default method for sorting +# defsort = "quicksort" +# Changing to mergesort to fix #441 +defsort = "mergesort" + +# Default policy for automatically updating indexes after a table +# append operation, or automatically reindexing after an +# index-invalidating operation like removing or modifying table rows. +default_auto_index = True +# Keep in sync with ``Table.autoindex`` docstring. + +# Default filters used to compress indexes. This is quite fast and +# compression is pretty good. 
+# Remember to keep these defaults in sync with the docstrings and UG. +default_index_filters = Filters(complevel=1, complib='zlib', + shuffle=True, fletcher32=False) + +# Deprecated API +defaultAutoIndex = default_auto_index +defaultIndexFilters = default_index_filters + +# The list of types for which an optimised search in cython and C has +# been implemented. Always add here the name of a new optimised type. +opt_search_types = ("int8", "int16", "int32", "int64", + "uint8", "uint16", "uint32", "uint64", + "float32", "float64") + +# The upper limit for uint32 ints +max32 = 2**32 + + +def _table_column_pathname_of_index(indexpathname): + names = indexpathname.split("/") + for i, name in enumerate(names): + if name.startswith('_i_'): + break + tablepathname = "/".join(names[:i]) + "/" + name[3:] + colpathname = "/".join(names[i + 1:]) + return (tablepathname, colpathname) + + +class Index(NotLoggedMixin, Group, indexesextension.Index): + """Represents the index of a column in a table. + + This class is used to keep the indexing information for columns in a Table + dataset (see :ref:`TableClassDescr`). It is actually a descendant of the + Group class (see :ref:`GroupClassDescr`), with some added functionality. An + Index is always associated with one and only one column in the table. + + .. note:: + + This class is mainly intended for internal use, but some of its + documented attributes and methods may be interesting for the + programmer. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode*. + + name : str + The name of this node in its parent group. + atom : Atom + An Atom object representing the shape and type of the atomic objects to + be saved. Only scalar atoms are supported. + title + Sets a TITLE attribute of the Index entity. + kind + The desired kind for this index. The 'full' kind specifies a complete + track of the row position (64-bit), while the 'medium', 'light' or + 'ultralight' kinds only specify in which chunk the row is (using + 32-bit, 16-bit and 8-bit respectively). + optlevel + The desired optimization level for this index. + filters : Filters + An instance of the Filters class that provides information about the + desired I/O filters to be applied during the life of this object. + tmp_dir + The directory for the temporary files. + expectedrows + Represents an user estimate about the number of row slices that will be + added to the growable dimension in the IndexArray object. + byteorder + The byteorder of the index datasets *on-disk*. + blocksizes + The four main sizes of the compound blocks in index datasets (a low + level parameter). + + """ + + _c_classid = 'INDEX' + + @property + def kind(self): + """The kind of this index.""" + return {1: 'ultralight', 2: 'light', + 4: 'medium', 8: 'full'}[self.indsize] + + @property + def filters(self): + """Filter properties for this index - see Filters in + :ref:`FiltersClassDescr`.""" + return self._v_filters + + @property + def dirty(self): + """Whether the index is dirty or not. + Dirty indexes are out of sync with column data, so they exist but they + are not usable. + """ + + # If there is no ``DIRTY`` attribute, index should be clean. + return getattr(self._v_attrs, 'DIRTY', False) + + @dirty.setter + def dirty(self, dirty): + wasdirty, isdirty = self.dirty, bool(dirty) + self._v_attrs.DIRTY = dirty + # If an *actual* change in dirtiness happens, + # notify the condition cache by setting or removing a nail. 
+ conditioncache = self.table._condition_cache + if not wasdirty and isdirty: + conditioncache.nail() + if wasdirty and not isdirty: + conditioncache.unnail() + + @property + def column(self): + """The Column (see :ref:`ColumnClassDescr`) instance for the indexed + column.""" + + tablepath, columnpath = _table_column_pathname_of_index( + self._v_pathname) + table = self._v_file._get_node(tablepath) + column = table.cols._g_col(columnpath) + return column + + @property + def table(self): + """Accessor for the `Table` object of this index.""" + tablepath, columnpath = _table_column_pathname_of_index( + self._v_pathname) + table = self._v_file._get_node(tablepath) + return table + + @property + def nblockssuperblock(self): + """The number of blocks in a superblock.""" + return self.superblocksize // self.blocksize + + @property + def nslicesblock(self): + """The number of slices in a block.""" + return self.blocksize // self.slicesize + + @property + def nchunkslice(self): + """The number of chunks in a slice.""" + return self.slicesize // self.chunksize + + @property + def nsuperblocks(self): + """The total number of superblocks in index.""" + # Last row should not be considered as a superblock + nelements = self.nelements - self.nelementsILR + nblocks = nelements // self.superblocksize + if nelements % self.blocksize > 0: + nblocks += 1 + return nblocks + + @property + def nblocks(self): + """The total number of blocks in index.""" + # Last row should not be considered as a block + nelements = self.nelements - self.nelementsILR + nblocks = nelements // self.blocksize + if nelements % self.blocksize > 0: + nblocks += 1 + return nblocks + + @property + def nslices(self): + """The number of complete slices in index.""" + return self.nelements // self.slicesize + + @property + def nchunks(self): + """The number of complete chunks in index.""" + return self.nelements // self.chunksize + + @property + def shape(self): + """The shape of this index (in slices and elements).""" + return (self.nrows, self.slicesize) + + @property + def temp_required(self): + """Whether a temporary file for indexes is required or not.""" + return (self.indsize > 1 and + self.optlevel > 0 and + self.table.nrows > self.slicesize) + + @property + def want_complete_sort(self): + """Whether we should try to build a completely sorted index or not.""" + return self.indsize == 8 and self.optlevel == 9 + + @property + def is_csi(self): + """Whether the index is completely sorted or not. + + .. versionchanged:: 3.0 + The *is_CSI* property has been renamed into *is_csi*. + + """ + + if self.nelements == 0: + # An index with 0 indexed elements is not a CSI one (by definition) + return False + if self.indsize < 8: + # An index that is not full cannot be completely sorted + return False + # Try with the 'is_csi' attribute + if 'is_csi' in self._v_attrs: + return self._v_attrs.is_csi + # If not, then compute the overlaps manually + # (the attribute 'is_csi' will be set there) + self.compute_overlaps(self, None, False) + return self.noverlaps == 0 + + @lazyattr + def nrowsinchunk(self): + """The number of rows that fits in a *table* chunk.""" + + return self.table.chunkshape[0] + + @lazyattr + def lbucket(self): + """Return the length of a bucket based index type.""" + + # Avoid to set a too large lbucket size (mainly useful for tests) + lbucket = min(self.nrowsinchunk, self.chunksize) + if self.indsize == 1: + # For ultra-light, we will never have to keep track of a + # bucket outside of a slice. 
+ maxnb = 2**8 + if self.slicesize > maxnb * lbucket: + lbucket = math.ceil(self.slicesize / maxnb) + elif self.indsize == 2: + # For light, we will never have to keep track of a + # bucket outside of a block. + maxnb = 2**16 + if self.blocksize > maxnb * lbucket: + lbucket = math.ceil(self.blocksize / maxnb) + else: + # For medium and full indexes there should not be a need to + # increase lbucket + pass + return lbucket + + def __init__(self, parentnode, name, + atom=None, title="", + kind=None, + optlevel=None, + filters=None, + tmp_dir=None, + expectedrows=0, + byteorder=None, + blocksizes=None, + new=True): + + self._v_version = None + """The object version of this index.""" + self.optlevel = optlevel + """The optimization level for this index.""" + self.tmp_dir = tmp_dir + """The directory for the temporary files.""" + self.expectedrows = expectedrows + """The expected number of items of index arrays.""" + if byteorder in ["little", "big"]: + self.byteorder = byteorder + else: + self.byteorder = sys.byteorder + """The byteorder of the index datasets.""" + if atom is not None: + self.dtype = atom.dtype.base + self.type = atom.type + """The datatypes to be stored by the sorted index array.""" + # ############## Important note ########################### + # The datatypes saved as index values are NumPy native + # types, so we get rid of type metainfo like Time* or Enum* + # that belongs to HDF5 types (actually, this metainfo is + # not needed for sorting and looking-up purposes). + # ######################################################### + indsize = { + 'ultralight': 1, 'light': 2, 'medium': 4, 'full': 8}[kind] + assert indsize in (1, 2, 4, 8), "indsize should be 1, 2, 4 or 8!" + self.indsize = indsize + """The itemsize for the indices part of the index.""" + + self.nrows = None + """The total number of slices in the index.""" + self.nelements = None + """The number of currently indexed rows for this column.""" + self.blocksizes = blocksizes + """The four main sizes of the compound blocks (if specified).""" + self.dirtycache = True + """Dirty cache (for ranges, bounds & sorted) flag.""" + self.superblocksize = None + """Size of the superblock for this index.""" + self.blocksize = None + """Size of the block for this index.""" + self.slicesize = None + """Size of the slice for this index.""" + self.chunksize = None + """Size of the chunk for this index.""" + self.tmpfilename = None + """Filename for temporary bounds.""" + self.opt_search_types = opt_search_types + """The types for which and optimized search has been implemented.""" + self.noverlaps = -1 + """The number of overlaps in an index. 0 means a completely + sorted index. -1 means that this number is not computed yet.""" + self.tprof = 0 + """Time counter for benchmarking purposes.""" + + from .file import open_file + self._openFile = open_file + """The `open_file()` function, to avoid a circular import.""" + + super().__init__(parentnode, name, title, new, filters) + + def _g_post_init_hook(self): + if self._v_new: + # The version for newly created indexes + self._v_version = obversion + super()._g_post_init_hook() + + # Index arrays must only be created for new indexes + if not self._v_new: + idxversion = self._v_version + # Set-up some variables from info on disk and return + attrs = self._v_attrs + # Coerce NumPy scalars to Python scalars in order + # to avoid undesired upcasting operations. 
+ self.superblocksize = int(attrs.superblocksize) + self.blocksize = int(attrs.blocksize) + self.slicesize = int(attrs.slicesize) + self.chunksize = int(attrs.chunksize) + self.blocksizes = (self.superblocksize, self.blocksize, + self.slicesize, self.chunksize) + self.optlevel = int(attrs.optlevel) + sorted = self.sorted + indices = self.indices + self.dtype = sorted.atom.dtype + self.type = sorted.atom.type + self.indsize = indices.atom.itemsize + # Some sanity checks for slicesize, chunksize and indsize + assert self.slicesize == indices.shape[1], "Wrong slicesize" + assert self.chunksize == indices._v_chunkshape[ + 1], "Wrong chunksize" + assert self.indsize in (1, 2, 4, 8), "Wrong indices itemsize" + if idxversion > "2.0": + self.reduction = int(attrs.reduction) + nelementsSLR = int(self.sortedLR.attrs.nelements) + nelementsILR = int(self.indicesLR.attrs.nelements) + else: + self.reduction = 1 + nelementsILR = self.indicesLR[-1] + nelementsSLR = nelementsILR + self.nrows = sorted.nrows + self.nelements = self.nrows * self.slicesize + nelementsILR + self.nelementsSLR = nelementsSLR + self.nelementsILR = nelementsILR + if nelementsILR > 0: + self.nrows += 1 + # Get the bounds as a cache (this has to remain here!) + rchunksize = self.chunksize // self.reduction + nboundsLR = (nelementsSLR - 1) // rchunksize + if nboundsLR < 0: + nboundsLR = 0 # correction for -1 bounds + nboundsLR += 2 # bounds + begin + end + # All bounds values (+begin + end) are at the end of sortedLR + self.bebounds = self.sortedLR[ + nelementsSLR:nelementsSLR + nboundsLR] + return + + # The index is new. Initialize the values + self.nrows = 0 + self.nelements = 0 + self.nelementsSLR = 0 + self.nelementsILR = 0 + + # The atom + atom = Atom.from_dtype(self.dtype) + + # The filters + filters = self.filters + + # Compute the superblocksize, blocksize, slicesize and chunksize values + # (in case these parameters haven't been passed to the constructor) + if self.blocksizes is None: + self.blocksizes = calc_chunksize( + self.expectedrows, self.optlevel, self.indsize, node=self) + (self.superblocksize, self.blocksize, + self.slicesize, self.chunksize) = self.blocksizes + if debug: + print("blocksizes:", self.blocksizes) + # Compute the reduction level + self.reduction = get_reduction_level( + self.indsize, self.optlevel, self.slicesize, self.chunksize) + rchunksize = self.chunksize // self.reduction + rslicesize = self.slicesize // self.reduction + + # Save them on disk as attributes + self._v_attrs.superblocksize = np.uint64(self.superblocksize) + self._v_attrs.blocksize = np.uint64(self.blocksize) + self._v_attrs.slicesize = np.uint32(self.slicesize) + self._v_attrs.chunksize = np.uint32(self.chunksize) + # Save the optlevel as well + self._v_attrs.optlevel = self.optlevel + # Save the reduction level + self._v_attrs.reduction = self.reduction + + # Create the IndexArray for sorted values + sorted = IndexArray(self, 'sorted', atom, "Sorted Values", + filters, self.byteorder) + + # Create the IndexArray for index values + IndexArray(self, 'indices', UIntAtom(itemsize=self.indsize), + "Number of chunk in table", filters, self.byteorder) + + # Create the cache for range values (1st order cache) + CacheArray(self, 'ranges', atom, (0, 2), "Range Values", filters, + self.expectedrows // self.slicesize, + byteorder=self.byteorder) + # median ranges + EArray(self, 'mranges', atom, (0,), "Median ranges", filters, + byteorder=self.byteorder, _log=False) + + # Create the cache for boundary values (2nd order cache) + nbounds_inslice = 
(rslicesize - 1) // rchunksize + CacheArray(self, 'bounds', atom, (0, nbounds_inslice), + "Boundary Values", filters, self.nchunks, + (1, nbounds_inslice), byteorder=self.byteorder) + + # begin, end & median bounds (only for numerical types) + EArray(self, 'abounds', atom, (0,), "Start bounds", filters, + byteorder=self.byteorder, _log=False) + EArray(self, 'zbounds', atom, (0,), "End bounds", filters, + byteorder=self.byteorder, _log=False) + EArray(self, 'mbounds', atom, (0,), "Median bounds", filters, + byteorder=self.byteorder, _log=False) + + # Create the Array for last (sorted) row values + bounds + shape = (rslicesize + 2 + nbounds_inslice,) + sortedLR = LastRowArray(self, 'sortedLR', atom, shape, + "Last Row sorted values + bounds", + filters, (rchunksize,), + byteorder=self.byteorder) + + # Create the Array for the number of chunk in last row + shape = (self.slicesize,) # enough for indexes and length + indicesLR = LastRowArray(self, 'indicesLR', + UIntAtom(itemsize=self.indsize), + shape, "Last Row indices", + filters, (self.chunksize,), + byteorder=self.byteorder) + + # The number of elements in LR will be initialized here + sortedLR.attrs.nelements = 0 + indicesLR.attrs.nelements = 0 + + # All bounds values (+begin + end) are uninitialized in creation time + self.bebounds = None + + # The starts and lengths initialization + self.starts = np.empty(shape=self.nrows, dtype=np.int32) + """Where the values fulfiling conditions starts for every slice.""" + self.lengths = np.empty(shape=self.nrows, dtype=np.int32) + """Lengths of the values fulfilling conditions for every slice.""" + + # Finally, create a temporary file for indexes if needed + if self.temp_required: + self.create_temp() + + def initial_append(self, xarr, nrow, reduction): + """Compute an initial indices arrays for data to be indexed.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering initial_append", tref) + arr = xarr.pop() + indsize = self.indsize + slicesize = self.slicesize + nelementsILR = self.nelementsILR + if profile: + show_stats("Before creating idx", tref) + if indsize == 8: + idx = np.arange(0, len(arr), dtype="uint64") + nrow * slicesize + elif indsize == 4: + # For medium (32-bit) all the rows in tables should be + # directly reachable. But as len(arr) < 2**31, we can + # choose uint32 for representing indices. In this way, we + # consume far less memory during the keysort process. The + # offset will be added in self.final_idx32() later on. + # + # This optimization also prevents the values in LR to + # participate in the ``swap_chunks`` process, and this is + # the main reason to not allow the medium indexes to create + # completely sorted indexes. However, I don't find this to + # be a big limitation, as probably fully indexes are much + # more suitable for producing completely sorted indexes + # because in this case the indices part is usable for + # getting the reverse indices of the index, and I forsee + # this to be a common requirement in many operations (for + # example, in table sorts). + # + # F. 
Alted 2008-09-15 + idx = np.arange(0, len(arr), dtype="uint32") + else: + idx = np.empty(len(arr), "uint%d" % (indsize * 8)) + lbucket = self.lbucket + # Fill the idx with the bucket indices + offset = int(lbucket - ((nrow * (slicesize % lbucket)) % lbucket)) + idx[0:offset] = 0 + for i in range(offset, slicesize, lbucket): + idx[i:i + lbucket] = (i + lbucket - 1) // lbucket + if indsize == 2: + # Add a second offset in this case + # First normalize the number of rows + offset2 = (nrow % self.nslicesblock) * slicesize // lbucket + idx += offset2 + # Add the last row at the beginning of arr & idx (if needed) + if (indsize == 8 and nelementsILR > 0): + # It is possible that the values in LR are already sorted. + # Fetch them and override existing values in arr and idx. + assert len(arr) > nelementsILR + self.read_slice_lr(self.sortedLR, arr[:nelementsILR]) + self.read_slice_lr(self.indicesLR, idx[:nelementsILR]) + # In-place sorting + if profile: + show_stats("Before keysort", tref) + indexesextension.keysort(arr, idx) + larr = arr[-1] + if reduction > 1: + # It's important to do a copy() here in order to ensure that + # sorted._append() will receive a contiguous array. + if profile: + show_stats("Before reduction", tref) + reduc = arr[::reduction].copy() + if profile: + show_stats("After reduction", tref) + arr = reduc + if profile: + show_stats("After arr <-- reduc", tref) + # A completely sorted index is not longer possible after an + # append of an index with already one slice. + if nrow > 0: + self._v_attrs.is_csi = False + if profile: + show_stats("Exiting initial_append", tref) + return larr, arr, idx + + def final_idx32(self, idx, offset): + """Perform final operations in 32-bit indices.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering final_idx32", tref) + # Do an upcast first in order to add the offset. + idx = idx.astype('uint64') + idx += offset + # The next partition is valid up to table sizes of + # 2**30 * 2**18 = 2**48 bytes, that is, 256 Tera-elements, + # which should be a safe figure, at least for a while. 
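+        # Dividing by lbucket turns absolute row positions into chunk (bucket)
+        # numbers, which is what a medium index stores on disk.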
+ idx //= self.lbucket + # After the division, we can downsize the indexes to 'uint32' + idx = idx.astype('uint32') + if profile: + show_stats("Exiting final_idx32", tref) + return idx + + def append(self, xarr, update=False): + """Append the array to the index objects.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering append", tref) + if not update and self.temp_required: + where = self.tmp + # The reduction will take place *after* the optimization process + reduction = 1 + else: + where = self + reduction = self.reduction + sorted = where.sorted + indices = where.indices + ranges = where.ranges + mranges = where.mranges + bounds = where.bounds + mbounds = where.mbounds + abounds = where.abounds + zbounds = where.zbounds + sortedLR = where.sortedLR + indicesLR = where.indicesLR + nrows = sorted.nrows # before sorted.append() + larr, arr, idx = self.initial_append(xarr, nrows, reduction) + # Save the sorted array + sorted.append(arr.reshape(1, arr.size)) + cs = self.chunksize // reduction + ncs = self.nchunkslice + # Save ranges & bounds + ranges.append([[arr[0], larr]]) + bounds.append([arr[cs::cs]]) + abounds.append(arr[0::cs]) + zbounds.append(arr[cs - 1::cs]) + # Compute the medians + smedian = arr[cs // 2::cs] + mbounds.append(smedian) + mranges.append([smedian[ncs // 2]]) + if profile: + show_stats("Before deleting arr & smedian", tref) + del arr, smedian # delete references + if profile: + show_stats("After deleting arr & smedian", tref) + # Now that arr is gone, we can upcast the indices and add the offset + if self.indsize == 4: + idx = self.final_idx32(idx, nrows * self.slicesize) + indices.append(idx.reshape(1, idx.size)) + if profile: + show_stats("Before deleting idx", tref) + del idx + # Update counters after a successful append + self.nrows = nrows + 1 + self.nelements = self.nrows * self.slicesize + self.nelementsSLR = 0 # reset the counter of the last row index to 0 + self.nelementsILR = 0 # reset the counter of the last row index to 0 + # The number of elements will be saved as an attribute. + # This is necessary in case the LR arrays can remember its values + # after a possible node preemtion/reload. 
+ sortedLR.attrs.nelements = self.nelementsSLR + indicesLR.attrs.nelements = self.nelementsILR + self.dirtycache = True # the cache is dirty now + if profile: + show_stats("Exiting append", tref) + + def append_last_row(self, xarr, update=False): + """Append the array to the last row index objects.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering appendLR", tref) + # compute the elements in the last row sorted & bounds array + nrows = self.nslices + if not update and self.temp_required: + where = self.tmp + # The reduction will take place *after* the optimization process + reduction = 1 + else: + where = self + reduction = self.reduction + indicesLR = where.indicesLR + sortedLR = where.sortedLR + larr, arr, idx = self.initial_append(xarr, nrows, reduction) + nelementsSLR = len(arr) + nelementsILR = len(idx) + # Build the cache of bounds + rchunksize = self.chunksize // reduction + self.bebounds = np.concatenate((arr[::rchunksize], [larr])) + # The number of elements will be saved as an attribute + sortedLR.attrs.nelements = nelementsSLR + indicesLR.attrs.nelements = nelementsILR + # Save the number of elements, bounds and sorted values + # at the end of the sorted array + offset2 = len(self.bebounds) + sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds + sortedLR[:nelementsSLR] = arr + del arr + # Now that arr is gone, we can upcast the indices and add the offset + if self.indsize == 4: + idx = self.final_idx32(idx, nrows * self.slicesize) + # Save the reverse index array + indicesLR[:len(idx)] = idx + del idx + # Update counters after a successful append + self.nrows = nrows + 1 + self.nelements = nrows * self.slicesize + nelementsILR + self.nelementsILR = nelementsILR + self.nelementsSLR = nelementsSLR + self.dirtycache = True # the cache is dirty now + if profile: + show_stats("Exiting appendLR", tref) + + def optimize(self, verbose=False): + """Optimize an index so as to allow faster searches. + + verbose + If True, messages about the progress of the + optimization process are printed out. + + """ + + if not self.temp_required: + return + + if verbose: + self.verbose = True + else: + self.verbose = debug + + # Initialize last_tover and last_nover + self.last_tover = 0 + self.last_nover = 0 + + # Compute the correct optimizations for current optim level + opts = calcoptlevels(self.nblocks, self.optlevel, self.indsize) + optmedian, optstarts, optstops, optfull = opts + + if debug: + print("optvalues:", opts) + + self.create_temp2() + # Start the optimization process + while True: + if optfull: + for niter in range(optfull): + if self.swap('chunks', 'median'): + break + if self.nblocks > 1: + # Swap slices only in the case that we have + # several blocks + if self.swap('slices', 'median'): + break + if self.swap('chunks', 'median'): + break + if self.swap('chunks', 'start'): + break + if self.swap('chunks', 'stop'): + break + else: + if optmedian: + if self.swap('chunks', 'median'): + break + if optstarts: + if self.swap('chunks', 'start'): + break + if optstops: + if self.swap('chunks', 'stop'): + break + break # If we reach this, exit the loop + + # Check if we require a complete sort. Important: this step + # should be carried out *after* the optimization process has + # been completed (this is to guarantee that the complete sort + # does not take too much memory). 
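+        # A complete sort is only attempted for 'full' indexes built with
+        # optlevel 9 (see the want_complete_sort property).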
+ if self.want_complete_sort: + if self.noverlaps > 0: + self.do_complete_sort() + # Check that we have effectively achieved the complete sort + if self.noverlaps > 0: + warnings.warn( + "OPSI was not able to achieve a completely sorted index." + " Please report this to the authors.", UserWarning) + + # Close and delete the temporal optimization index file + self.cleanup_temp() + return + + def do_complete_sort(self): + """Bring an already optimized index into a complete sorted state.""" + + if self.verbose: + t1 = clock() + c1 = cpuclock() + ss = self.slicesize + tmp = self.tmp + ranges = tmp.ranges[:] + nslices = self.nslices + + nelementsLR = self.nelementsILR + if nelementsLR > 0: + # Add the ranges corresponding to the last row + rangeslr = np.array([self.bebounds[0], self.bebounds[-1]]) + ranges = np.concatenate((ranges, [rangeslr])) + nslices += 1 + + sorted = tmp.sorted + indices = tmp.indices + sortedLR = tmp.sortedLR + indicesLR = tmp.indicesLR + sremain = np.array([], dtype=self.dtype) + iremain = np.array([], dtype='u%d' % self.indsize) + starts = np.zeros(shape=nslices, dtype=np.int_) + for i in range(nslices): + # Find the overlapping elements for slice i + sover = np.array([], dtype=self.dtype) + iover = np.array([], dtype='u%d' % self.indsize) + prev_end = ranges[i, 1] + for j in range(i + 1, nslices): + stj = starts[j] + if ((j < self.nslices and stj == ss) or + (j == self.nslices and stj == nelementsLR)): + # This slice has been already dealt with + continue + if j < self.nslices: + assert stj < ss, \ + "Two slices cannot overlap completely at this stage!" + next_beg = sorted[j, stj] + else: + assert stj < nelementsLR, \ + "Two slices cannot overlap completely at this stage!" + next_beg = sortedLR[stj] + next_end = ranges[j, 1] + if prev_end > next_end: + # Complete overlapping case + if j < self.nslices: + sover = np.concatenate((sover, sorted[j, stj:])) + iover = np.concatenate((iover, indices[j, stj:])) + starts[j] = ss + else: + n = nelementsLR + sover = np.concatenate((sover, sortedLR[stj:n])) + iover = np.concatenate((iover, indicesLR[stj:n])) + starts[j] = nelementsLR + elif prev_end > next_beg: + idx = self.search_item_lt(tmp, prev_end, j, ranges[j], stj) + if j < self.nslices: + sover = np.concatenate((sover, sorted[j, stj:idx])) + iover = np.concatenate((iover, indices[j, stj:idx])) + else: + sover = np.concatenate((sover, sortedLR[stj:idx])) + iover = np.concatenate((iover, indicesLR[stj:idx])) + starts[j] = idx + # Build the extended slices to sort out + if i < self.nslices: + ssorted = np.concatenate( + (sremain, sorted[i, starts[i]:], sover)) + sindices = np.concatenate( + (iremain, indices[i, starts[i]:], iover)) + else: + ssorted = np.concatenate( + (sremain, sortedLR[starts[i]:nelementsLR], sover)) + sindices = np.concatenate( + (iremain, indicesLR[starts[i]:nelementsLR], iover)) + # Sort the extended slices + indexesextension.keysort(ssorted, sindices) + # Save the first elements of extended slices in the slice i + if i < self.nslices: + sorted[i] = ssorted[:ss] + indices[i] = sindices[:ss] + # Update caches for this slice + self.update_caches(i, ssorted[:ss]) + # Save the remaining values in a separate array + send = len(sover) + len(sremain) + sremain = ssorted[ss:ss + send] + iremain = sindices[ss:ss + send] + else: + # Still some elements remain for the last row + n = len(ssorted) + assert n == nelementsLR + send = 0 + sortedLR[:n] = ssorted + indicesLR[:n] = sindices + # Update the caches for last row + sortedlr = sortedLR[:nelementsLR] + bebounds = 
np.concatenate( + (sortedlr[::self.chunksize], [sortedlr[-1]])) + sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds + self.bebounds = bebounds + + # Verify that we have dealt with all the remaining values + assert send == 0 + + # Compute the overlaps in order to verify that we have achieved + # a complete sort. This has to be executed always (and not only + # in verbose mode!). + self.compute_overlaps(self.tmp, "do_complete_sort()", self.verbose) + if self.verbose: + print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}") + + def swap(self, what, mode=None): + """Swap chunks or slices using a certain bounds reference.""" + + # Thresholds for avoiding continuing the optimization + # thnover = 4 * self.slicesize # minimum number of overlapping + # # elements + thnover = 40 + thmult = 0.1 # minimum ratio of multiplicity (a 10%) + thtover = 0.01 # minimum overlaping index for slices (a 1%) + + if self.verbose: + t1 = clock() + c1 = cpuclock() + if what == "chunks": + self.swap_chunks(mode) + elif what == "slices": + self.swap_slices(mode) + if mode: + message = f"swap_{what}({mode})" + else: + message = f"swap_{what}" + (nover, mult, tover) = self.compute_overlaps( + self.tmp, message, self.verbose) + rmult = len(mult.nonzero()[0]) / len(mult) + if self.verbose: + print(f"time: {clock() - t1:.4f}. clock: {cpuclock() - c1:.4f}") + # Check that entropy is actually decreasing + if what == "chunks" and self.last_tover > 0 and self.last_nover > 0: + tover_var = (self.last_tover - tover) / self.last_tover + nover_var = (self.last_nover - nover) / self.last_nover + if tover_var < 0.05 and nover_var < 0.05: + # Less than a 5% of improvement is too few + return True + self.last_tover = tover + self.last_nover = nover + # Check if some threshold has met + if nover < thnover: + return True + if rmult < thmult: + return True + # Additional check for the overlap ratio + if 0 <= tover < thtover: + return True + return False + + def create_temp(self): + """Create some temporary objects for slice sorting purposes.""" + + # The index will be dirty during the index optimization process + self.dirty = True + # Build the name of the temporary file + fd, self.tmpfilename = tempfile.mkstemp( + ".tmp", "pytables-", self.tmp_dir) + # Close the file descriptor so as to avoid leaks + os.close(fd) + # Create the proper PyTables file + self.tmpfile = self._openFile(self.tmpfilename, "w") + self.tmp = tmp = self.tmpfile.root + cs = self.chunksize + ss = self.slicesize + filters = self.filters + # temporary sorted & indices arrays + shape = (0, ss) + atom = Atom.from_dtype(self.dtype) + EArray(tmp, 'sorted', atom, shape, + "Temporary sorted", filters, chunkshape=(1, cs)) + EArray(tmp, 'indices', UIntAtom(itemsize=self.indsize), shape, + "Temporary indices", filters, chunkshape=(1, cs)) + # temporary bounds + nbounds_inslice = (ss - 1) // cs + shape = (0, nbounds_inslice) + EArray(tmp, 'bounds', atom, shape, "Temp chunk bounds", + filters, chunkshape=(cs, nbounds_inslice)) + shape = (0,) + EArray(tmp, 'abounds', atom, shape, "Temp start bounds", + filters, chunkshape=(cs,)) + EArray(tmp, 'zbounds', atom, shape, "Temp end bounds", + filters, chunkshape=(cs,)) + EArray(tmp, 'mbounds', atom, shape, "Median bounds", + filters, chunkshape=(cs,)) + # temporary ranges + EArray(tmp, 'ranges', atom, (0, 2), + "Temporary range values", filters, chunkshape=(cs, 2)) + EArray(tmp, 'mranges', atom, (0,), + "Median ranges", filters, chunkshape=(cs,)) + # temporary last row (sorted) + shape = (ss + 2 + nbounds_inslice,) 
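+        # The extra room holds the begin/end bounds stored right after the
+        # last-row sorted values.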
+ CArray(tmp, 'sortedLR', atom, shape, + "Temp Last Row sorted values + bounds", + filters, chunkshape=(cs,)) + # temporary last row (indices) + shape = (ss,) + CArray(tmp, 'indicesLR', + UIntAtom(itemsize=self.indsize), + shape, "Temp Last Row indices", + filters, chunkshape=(cs,)) + + def create_temp2(self): + """Create some temporary objects for slice sorting purposes.""" + + # The algorithms for doing the swap can be optimized so that + # one should be necessary to create temporaries for keeping just + # the contents of a single superblock. + # F. Alted 2007-01-03 + cs = self.chunksize + ss = self.slicesize + filters = self.filters + # temporary sorted & indices arrays + shape = (self.nslices, ss) + atom = Atom.from_dtype(self.dtype) + tmp = self.tmp + CArray(tmp, 'sorted2', atom, shape, + "Temporary sorted 2", filters, chunkshape=(1, cs)) + CArray(tmp, 'indices2', UIntAtom(itemsize=self.indsize), shape, + "Temporary indices 2", filters, chunkshape=(1, cs)) + # temporary bounds + nbounds_inslice = (ss - 1) // cs + shape = (self.nslices, nbounds_inslice) + CArray(tmp, 'bounds2', atom, shape, "Temp chunk bounds 2", + filters, chunkshape=(cs, nbounds_inslice)) + shape = (self.nchunks,) + CArray(tmp, 'abounds2', atom, shape, "Temp start bounds 2", + filters, chunkshape=(cs,)) + CArray(tmp, 'zbounds2', atom, shape, "Temp end bounds 2", + filters, chunkshape=(cs,)) + CArray(tmp, 'mbounds2', atom, shape, "Median bounds 2", + filters, chunkshape=(cs,)) + # temporary ranges + CArray(tmp, 'ranges2', atom, (self.nslices, 2), + "Temporary range values 2", filters, chunkshape=(cs, 2)) + CArray(tmp, 'mranges2', atom, (self.nslices,), + "Median ranges 2", filters, chunkshape=(cs,)) + + def cleanup_temp(self): + """Copy the data and delete the temporaries for sorting purposes.""" + + if self.verbose: + print("Copying temporary data...") + # tmp -> index + reduction = self.reduction + cs = self.chunksize // reduction + ncs = self.nchunkslice + tmp = self.tmp + for i in range(self.nslices): + # Copy sorted & indices slices + sorted = tmp.sorted[i][::reduction].copy() + self.sorted.append(sorted.reshape(1, sorted.size)) + # Compute ranges + self.ranges.append([[sorted[0], sorted[-1]]]) + # Compute chunk bounds + self.bounds.append([sorted[cs::cs]]) + # Compute start, stop & median bounds and ranges + self.abounds.append(sorted[0::cs]) + self.zbounds.append(sorted[cs - 1::cs]) + smedian = sorted[cs // 2::cs] + self.mbounds.append(smedian) + self.mranges.append([smedian[ncs // 2]]) + del sorted, smedian # delete references + # Now that sorted is gone, we can copy the indices + indices = tmp.indices[i] + self.indices.append(indices.reshape(1, indices.size)) + + # Now it is the last row turn (if needed) + if self.nelementsSLR > 0: + # First, the sorted values + sortedLR = self.sortedLR + indicesLR = self.indicesLR + nelementsLR = self.nelementsILR + sortedlr = tmp.sortedLR[:nelementsLR][::reduction].copy() + nelementsSLR = len(sortedlr) + sortedLR[:nelementsSLR] = sortedlr + # Now, the bounds + self.bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]])) + offset2 = len(self.bebounds) + sortedLR[nelementsSLR:nelementsSLR + offset2] = self.bebounds + # Finally, the indices + indicesLR[:] = tmp.indicesLR[:] + # Update the number of (reduced) sorted elements + self.nelementsSLR = nelementsSLR + # The number of elements will be saved as an attribute + self.sortedLR.attrs.nelements = self.nelementsSLR + self.indicesLR.attrs.nelements = self.nelementsILR + + if self.verbose: + print("Deleting temporaries...") 
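+        # Release the reference to the temporary group, then close and
+        # remove the temporary file from disk.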
+ self.tmp = None + self.tmpfile.close() + Path(self.tmpfilename).unlink() + self.tmpfilename = None + + # The optimization process has finished, and the index is ok now + self.dirty = False + # ...but the memory data cache is dirty now + self.dirtycache = True + + def get_neworder(self, neworder, src_disk, tmp_disk, + lastrow, nslices, offset, dtype): + """Get sorted & indices values in new order.""" + + cs = self.chunksize + ncs = ncs2 = self.nchunkslice + self_nslices = self.nslices + tmp = np.empty(shape=self.slicesize, dtype=dtype) + for i in range(nslices): + ns = offset + i + if ns == self_nslices: + # The number of complete chunks in the last row + ncs2 = self.nelementsILR // cs + # Get slices in new order + for j in range(ncs2): + idx = neworder[i * ncs + j] + ins = idx // ncs + inc = (idx - ins * ncs) * cs + ins += offset + nc = j * cs + if ins == self_nslices: + tmp[nc:nc + cs] = lastrow[inc:inc + cs] + else: + tmp[nc:nc + cs] = src_disk[ins, inc:inc + cs] + if ns == self_nslices: + # The number of complete chunks in the last row + lastrow[:ncs2 * cs] = tmp[:ncs2 * cs] + # The elements in the last chunk of the last row will + # participate in the global reordering later on, during + # the phase of sorting of *two* slices at a time + # (including the last row slice, see + # self.reorder_slices()). The caches for last row will + # be updated in self.reorder_slices() too. + # F. Altet 2008-08-25 + else: + tmp_disk[ns] = tmp + + def swap_chunks(self, mode="median"): + """Swap & reorder the different chunks in a block.""" + + boundsnames = { + 'start': 'abounds', 'stop': 'zbounds', 'median': 'mbounds'} + tmp = self.tmp + sorted = tmp.sorted + indices = tmp.indices + tmp_sorted = tmp.sorted2 + tmp_indices = tmp.indices2 + sortedLR = tmp.sortedLR + indicesLR = tmp.indicesLR + cs = self.chunksize + ncs = self.nchunkslice + nsb = self.nslicesblock + ncb = ncs * nsb + ncb2 = ncb + boundsobj = tmp._f_get_child(boundsnames[mode]) + can_cross_bbounds = (self.indsize == 8 and self.nelementsILR > 0) + for nblock in range(self.nblocks): + # Protection for last block having less chunks than ncb + remainingchunks = self.nchunks - nblock * ncb + if remainingchunks < ncb: + ncb2 = remainingchunks + if ncb2 <= 1: + # if only zero or one chunks remains we are done + break + nslices = ncb2 // ncs + bounds = boundsobj[nblock * ncb:nblock * ncb + ncb2] + # Do this only if lastrow elements can cross block boundaries + if (nblock == self.nblocks - 1 and # last block + can_cross_bbounds): + nslices += 1 + ul = self.nelementsILR // cs + bounds = np.concatenate((bounds, self.bebounds[:ul])) + sbounds_idx = bounds.argsort(kind=defsort) + offset = int(nblock * nsb) + # Swap sorted and indices following the new order + self.get_neworder(sbounds_idx, sorted, tmp_sorted, sortedLR, + nslices, offset, self.dtype) + self.get_neworder(sbounds_idx, indices, tmp_indices, indicesLR, + nslices, offset, 'u%d' % self.indsize) + # Reorder completely the index at slice level + self.reorder_slices(tmp=True) + + def read_slice(self, where, nslice, buffer, start=0): + """Read a slice from the `where` dataset and put it in `buffer`.""" + + # Create the buffers for specifying the coordinates + self.startl = np.array([nslice, start], np.uint64) + self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64) + self.stepl = np.ones(shape=2, dtype=np.uint64) + where._g_read_slice(self.startl, self.stopl, self.stepl, buffer) + + def write_slice(self, where, nslice, buffer, start=0): + """Write a `slice` to the `where` dataset 
with the `buffer` data.""" + + self.startl = np.array([nslice, start], np.uint64) + self.stopl = np.array([nslice + 1, start + buffer.size], np.uint64) + self.stepl = np.ones(shape=2, dtype=np.uint64) + countl = self.stopl - self.startl # (1, self.slicesize) + where._g_write_slice(self.startl, self.stepl, countl, buffer) + + # Read version for LastRow + def read_slice_lr(self, where, buffer, start=0): + """Read a slice from the `where` dataset and put it in `buffer`.""" + + startl = np.array([start], dtype=np.uint64) + stopl = np.array([start + buffer.size], dtype=np.uint64) + stepl = np.array([1], dtype=np.uint64) + where._g_read_slice(startl, stopl, stepl, buffer) + + # Write version for LastRow + def write_sliceLR(self, where, buffer, start=0): + """Write a slice from the `where` dataset with the `buffer` data.""" + + startl = np.array([start], dtype=np.uint64) + countl = np.array([start + buffer.size], dtype=np.uint64) + stepl = np.array([1], dtype=np.uint64) + where._g_write_slice(startl, stepl, countl, buffer) + + def reorder_slice(self, nslice, sorted, indices, ssorted, sindices, + tmp_sorted, tmp_indices): + """Copy & reorder the slice in source to final destination.""" + + ss = self.slicesize + # Load the second part in buffers + self.read_slice(tmp_sorted, nslice, ssorted[ss:]) + self.read_slice(tmp_indices, nslice, sindices[ss:]) + indexesextension.keysort(ssorted, sindices) + # Write the first part of the buffers to the regular leaves + self.write_slice(sorted, nslice - 1, ssorted[:ss]) + self.write_slice(indices, nslice - 1, sindices[:ss]) + # Update caches + self.update_caches(nslice - 1, ssorted[:ss]) + # Shift the slice in the end to the beginning + ssorted[:ss] = ssorted[ss:] + sindices[:ss] = sindices[ss:] + + def update_caches(self, nslice, ssorted): + """Update the caches for faster lookups.""" + + cs = self.chunksize + ncs = self.nchunkslice + tmp = self.tmp + # update first & second cache bounds (ranges & bounds) + tmp.ranges[nslice] = ssorted[[0, -1]] + tmp.bounds[nslice] = ssorted[cs::cs] + # update start & stop bounds + tmp.abounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[0::cs] + tmp.zbounds[nslice * ncs:(nslice + 1) * ncs] = ssorted[cs - 1::cs] + # update median bounds + smedian = ssorted[cs // 2::cs] + tmp.mbounds[nslice * ncs:(nslice + 1) * ncs] = smedian + tmp.mranges[nslice] = smedian[ncs // 2] + + def reorder_slices(self, tmp): + """Reorder completely the index at slice level. + + This method has to maintain the locality of elements in the + ambit of ``blocks``, i.e. an element of a ``block`` cannot be + sent to another ``block`` during this reordering. This is + *critical* for ``light`` indexes to be able to use this. + + This version of reorder_slices is optimized in that *two* + complete slices are taken at a time (including the last row + slice) so as to sort them. Then, each new slice that is read is + put at the end of this two-slice buffer, while the previous one + is moved to the beginning of the buffer. This is in order to + better reduce the entropy of the regular part (i.e. all except + the last row) of the index. + + A secondary effect of this is that it takes at least *twice* of + memory than a previous version of reorder_slices() that only + reorders on a slice-by-slice basis. However, as this is more + efficient than the old version, one can configure the slicesize + to be smaller, so the memory consumption is barely similar. 
+ + """ + + tmp = self.tmp + sorted = tmp.sorted + indices = tmp.indices + if tmp: + tmp_sorted = tmp.sorted2 + tmp_indices = tmp.indices2 + else: + tmp_sorted = tmp.sorted + tmp_indices = tmp.indices + cs = self.chunksize + ss = self.slicesize + nsb = self.blocksize // self.slicesize + nslices = self.nslices + nblocks = self.nblocks + nelementsLR = self.nelementsILR + # Create the buffer for reordering 2 slices at a time + ssorted = np.empty(shape=ss * 2, dtype=self.dtype) + sindices = np.empty(shape=ss * 2, dtype=np.dtype('u%d' % self.indsize)) + + if self.indsize == 8: + # Bootstrap the process for reordering + # Read the first slice in buffers + self.read_slice(tmp_sorted, 0, ssorted[:ss]) + self.read_slice(tmp_indices, 0, sindices[:ss]) + + nslice = 0 # Just in case the loop behind executes nothing + # Loop over the remainding slices in block + for nslice in range(1, sorted.nrows): + self.reorder_slice(nslice, sorted, indices, + ssorted, sindices, + tmp_sorted, tmp_indices) + + # End the process (enrolling the lastrow if necessary) + if nelementsLR > 0: + sortedLR = self.tmp.sortedLR + indicesLR = self.tmp.indicesLR + # Shrink the ssorted and sindices arrays to the minimum + ssorted2 = ssorted[:ss + nelementsLR] + sortedlr = ssorted2[ss:] + sindices2 = sindices[:ss + nelementsLR] + indiceslr = sindices2[ss:] + # Read the last row info in the second part of the buffer + self.read_slice_lr(sortedLR, sortedlr) + self.read_slice_lr(indicesLR, indiceslr) + indexesextension.keysort(ssorted2, sindices2) + # Write the second part of the buffers to the lastrow indices + self.write_sliceLR(sortedLR, sortedlr) + self.write_sliceLR(indicesLR, indiceslr) + # Update the caches for last row + bebounds = np.concatenate((sortedlr[::cs], [sortedlr[-1]])) + sortedLR[nelementsLR:nelementsLR + len(bebounds)] = bebounds + self.bebounds = bebounds + # Write the first part of the buffers to the regular leaves + self.write_slice(sorted, nslice, ssorted[:ss]) + self.write_slice(indices, nslice, sindices[:ss]) + # Update caches for this slice + self.update_caches(nslice, ssorted[:ss]) + else: + # Iterate over each block. No data should cross block + # boundaries to avoid adressing problems with short indices. 
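+            # (Light indices can only address chunks inside their own block,
+            # and ultralight ones inside their own slice, so rows must stay
+            # within their block during this reordering.)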
+ for nb in range(nblocks): + # Bootstrap the process for reordering + # Read the first slice in buffers + nrow = nb * nsb + self.read_slice(tmp_sorted, nrow, ssorted[:ss]) + self.read_slice(tmp_indices, nrow, sindices[:ss]) + + # Loop over the remainding slices in block + lrb = nrow + nsb + if lrb > nslices: + lrb = nslices + nslice = nrow # Just in case the loop behind executes nothing + for nslice in range(nrow + 1, lrb): + self.reorder_slice(nslice, sorted, indices, + ssorted, sindices, + tmp_sorted, tmp_indices) + + # Write the first part of the buffers to the regular leaves + self.write_slice(sorted, nslice, ssorted[:ss]) + self.write_slice(indices, nslice, sindices[:ss]) + # Update caches for this slice + self.update_caches(nslice, ssorted[:ss]) + + def swap_slices(self, mode="median"): + """Swap slices in a superblock.""" + + tmp = self.tmp + sorted = tmp.sorted + indices = tmp.indices + tmp_sorted = tmp.sorted2 + tmp_indices = tmp.indices2 + ncs = self.nchunkslice + nss = self.superblocksize // self.slicesize + nss2 = nss + for sblock in range(self.nsuperblocks): + # Protection for last superblock having less slices than nss + remainingslices = self.nslices - sblock * nss + if remainingslices < nss: + nss2 = remainingslices + if nss2 <= 1: + break + if mode == "start": + ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 0] + elif mode == "stop": + ranges = tmp.ranges[sblock * nss:sblock * nss + nss2, 1] + elif mode == "median": + ranges = tmp.mranges[sblock * nss:sblock * nss + nss2] + sranges_idx = ranges.argsort(kind=defsort) + # Don't swap the superblock at all if one doesn't need to + ndiff = (sranges_idx != np.arange(nss2)).sum() / 2 + if ndiff * 50 < nss2: + # The number of slices to rearrange is less than 2.5%, + # so skip the reordering of this superblock + # (too expensive for such a little improvement) + if self.verbose: + print("skipping reordering of superblock ->", sblock) + continue + ns = sblock * nss2 + # Swap sorted and indices slices following the new order + for i in range(nss2): + idx = sranges_idx[i] + # Swap sorted & indices slices + oi = ns + i + oidx = ns + idx + tmp_sorted[oi] = sorted[oidx] + tmp_indices[oi] = indices[oidx] + # Swap start, stop & median ranges + tmp.ranges2[oi] = tmp.ranges[oidx] + tmp.mranges2[oi] = tmp.mranges[oidx] + # Swap chunk bounds + tmp.bounds2[oi] = tmp.bounds[oidx] + # Swap start, stop & median bounds + j = oi * ncs + jn = (oi + 1) * ncs + xj = oidx * ncs + xjn = (oidx + 1) * ncs + tmp.abounds2[j:jn] = tmp.abounds[xj:xjn] + tmp.zbounds2[j:jn] = tmp.zbounds[xj:xjn] + tmp.mbounds2[j:jn] = tmp.mbounds[xj:xjn] + # tmp -> originals + for i in range(nss2): + # Copy sorted & indices slices + oi = ns + i + sorted[oi] = tmp_sorted[oi] + indices[oi] = tmp_indices[oi] + # Copy start, stop & median ranges + tmp.ranges[oi] = tmp.ranges2[oi] + tmp.mranges[oi] = tmp.mranges2[oi] + # Copy chunk bounds + tmp.bounds[oi] = tmp.bounds2[oi] + # Copy start, stop & median bounds + j = oi * ncs + jn = (oi + 1) * ncs + tmp.abounds[j:jn] = tmp.abounds2[j:jn] + tmp.zbounds[j:jn] = tmp.zbounds2[j:jn] + tmp.mbounds[j:jn] = tmp.mbounds2[j:jn] + + def search_item_lt(self, where, item, nslice, limits, start=0): + """Search a single item in a specific sorted slice.""" + + # This method will only works under the assumtion that item + # *is to be found* in the nslice. 
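+        # The assertion below enforces that precondition by checking the item
+        # against the value limits of the slice.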
+ assert nan_aware_lt(limits[0], item) and nan_aware_le(item, limits[1]) + cs = self.chunksize + ss = self.slicesize + nelementsLR = self.nelementsILR + bstart = start // cs + + # Find the chunk + if nslice < self.nslices: + nchunk = bisect_left(where.bounds[nslice], item, bstart) + else: + # We need to subtract 1 chunk here because bebounds + # has a leading value + nchunk = bisect_left(self.bebounds, item, bstart) - 1 + assert nchunk >= 0 + + # Find the element in chunk + pos = nchunk * cs + if nslice < self.nslices: + pos += bisect_left(where.sorted[nslice, pos:pos + cs], item) + assert pos <= ss + else: + end = pos + cs + if end > nelementsLR: + end = nelementsLR + pos += bisect_left(self.sortedLR[pos:end], item) + assert pos <= nelementsLR + assert pos > 0 + return pos + + def compute_overlaps_finegrain(self, where, message, verbose): + """Compute some statistics about overlaping of slices in index. + + It returns the following info: + + noverlaps : int + The total number of elements that overlaps in index. + multiplicity : array of int + The number of times that a concrete slice overlaps with any other. + toverlap : float + An ovelap index: the sum of the values in segment slices that + overlaps divided by the entire range of values. This index is only + computed for numerical types. + + """ + + ss = self.slicesize + ranges = where.ranges[:] + sorted = where.sorted + sortedLR = where.sortedLR + nslices = self.nslices + nelementsLR = self.nelementsILR + if nelementsLR > 0: + # Add the ranges corresponding to the last row + rangeslr = np.array([self.bebounds[0], self.bebounds[-1]]) + ranges = np.concatenate((ranges, [rangeslr])) + nslices += 1 + soverlap = 0 + toverlap = -1 + multiplicity = np.zeros(shape=nslices, dtype="int_") + overlaps = multiplicity.copy() + starts = multiplicity.copy() + for i in range(nslices): + prev_end = ranges[i, 1] + for j in range(i + 1, nslices): + stj = starts[j] + assert stj <= ss + if stj == ss: + # This slice has already been counted + continue + if j < self.nslices: + next_beg = sorted[j, stj] + else: + next_beg = sortedLR[stj] + next_end = ranges[j, 1] + if prev_end > next_end: + # Complete overlapping case + multiplicity[j - i] += 1 + if j < self.nslices: + overlaps[i] += ss - stj + starts[j] = ss # a sentinel + else: + overlaps[i] += nelementsLR - stj + starts[j] = nelementsLR # a sentinel + elif prev_end > next_beg: + multiplicity[j - i] += 1 + idx = self.search_item_lt( + where, prev_end, j, ranges[j], stj) + nelem = idx - stj + overlaps[i] += nelem + starts[j] = idx + if self.type != "string": + # Convert ranges into floats in order to allow + # doing operations with them without overflows + soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) + + # Return the overlap as the ratio between overlaps and entire range + if self.type != "string": + erange = float(ranges[-1, 1]) - float(ranges[0, 0]) + # Check that there is an effective range of values + # Beware, erange can be negative in situations where + # the values are suffering overflow. This can happen + # specially on big signed integer values (on overflows, + # the end value will become negative!). + # Also, there is no way to compute overlap ratios for + # non-numerical types. So, be careful and always check + # that toverlap has a positive value (it must have been + # initialized to -1. before) before using it. + # F. 
Alted 2007-01-19 + if erange > 0: + toverlap = soverlap / erange + if verbose and message != "init": + print("toverlap (%s):" % message, toverlap) + print("multiplicity:\n", multiplicity, multiplicity.sum()) + print("overlaps:\n", overlaps, overlaps.sum()) + noverlaps = overlaps.sum() + # For full indexes, set the 'is_csi' flag + if self.indsize == 8 and self._v_file._iswritable(): + self._v_attrs.is_csi = (noverlaps == 0) + # Save the number of overlaps for future references + self.noverlaps = noverlaps + return (noverlaps, multiplicity, toverlap) + + def compute_overlaps(self, where, message, verbose): + """Compute some statistics about overlaping of slices in index. + + It returns the following info: + + noverlaps : int + The total number of slices that overlaps in index. + multiplicity : array of int + The number of times that a concrete slice overlaps with any other. + toverlap : float + An ovelap index: the sum of the values in segment slices that + overlaps divided by the entire range of values. This index is only + computed for numerical types. + + """ + + ranges = where.ranges[:] + nslices = self.nslices + if self.nelementsILR > 0: + # Add the ranges corresponding to the last row + rangeslr = np.array([self.bebounds[0], self.bebounds[-1]]) + ranges = np.concatenate((ranges, [rangeslr])) + nslices += 1 + noverlaps = 0 + soverlap = 0 + toverlap = -1 + multiplicity = np.zeros(shape=nslices, dtype="int_") + for i in range(nslices): + for j in range(i + 1, nslices): + if ranges[i, 1] > ranges[j, 0]: + noverlaps += 1 + multiplicity[j - i] += 1 + if self.type != "string": + # Convert ranges into floats in order to allow + # doing operations with them without overflows + soverlap += float(ranges[i, 1]) - float(ranges[j, 0]) + + # Return the overlap as the ratio between overlaps and entire range + if self.type != "string": + erange = float(ranges[-1, 1]) - float(ranges[0, 0]) + # Check that there is an effective range of values + # Beware, erange can be negative in situations where + # the values are suffering overflow. This can happen + # specially on big signed integer values (on overflows, + # the end value will become negative!). + # Also, there is no way to compute overlap ratios for + # non-numerical types. So, be careful and always check + # that toverlap has a positive value (it must have been + # initialized to -1. before) before using it. + # F. 
Altet 2007-01-19 + if erange > 0: + toverlap = soverlap / erange + if verbose: + print("overlaps (%s):" % message, noverlaps, toverlap) + print(multiplicity) + # For full indexes, set the 'is_csi' flag + if self.indsize == 8 and self._v_file._iswritable(): + self._v_attrs.is_csi = (noverlaps == 0) + # Save the number of overlaps for future references + self.noverlaps = noverlaps + return (noverlaps, multiplicity, toverlap) + + def read_sorted_indices(self, what, start, stop, step): + """Return the sorted or indices values in the specified range.""" + (start, stop, step) = self._process_range(start, stop, step) + if start >= stop: + return np.empty(0, self.dtype) + # Correction for negative values of step (reverse indices) + if step < 0: + tmp = start + start = self.nelements - stop + stop = self.nelements - tmp + if what == "sorted": + values = self.sorted + valuesLR = self.sortedLR + buffer_ = np.empty(stop - start, dtype=self.dtype) + else: + values = self.indices + valuesLR = self.indicesLR + buffer_ = np.empty(stop - start, dtype="u%d" % self.indsize) + ss = self.slicesize + nrow_start = start // ss + istart = start % ss + nrow_stop = stop // ss + tlen = stop - start + bstart = 0 + ilen = 0 + for nrow in range(nrow_start, nrow_stop + 1): + blen = ss - istart + if ilen + blen > tlen: + blen = tlen - ilen + if blen <= 0: + break + if nrow < self.nslices: + self.read_slice( + values, nrow, buffer_[bstart:bstart + blen], istart) + else: + self.read_slice_lr( + valuesLR, buffer_[bstart:bstart + blen], istart) + istart = 0 + bstart += blen + ilen += blen + return buffer_[::step] + + def read_sorted(self, start=None, stop=None, step=None): + """Return the sorted values of index in the specified range. + + The meaning of the start, stop and step arguments is the same as in + :meth:`Table.read_sorted`. + + """ + + return self.read_sorted_indices('sorted', start, stop, step) + + def read_indices(self, start=None, stop=None, step=None): + """Return the indices values of index in the specified range. + + The meaning of the start, stop and step arguments is the same as in + :meth:`Table.read_sorted`. + + """ + + return self.read_sorted_indices('indices', start, stop, step) + + def _process_range(self, start, stop, step): + """Get a range specifc for the index usage.""" + + if start is not None and stop is None: + # Special case for the behaviour of PyTables iterators + stop = idx2long(start + 1) + if start is None: + start = 0 + else: + start = idx2long(start) + if stop is None: + stop = idx2long(self.nelements) + else: + stop = idx2long(stop) + if step is None: + step = 1 + else: + step = idx2long(step) + return (start, stop, step) + + def __getitem__(self, key): + """Return the indices values of index in the specified range. + + If key argument is an integer, the corresponding index is returned. If + key is a slice, the range of indices determined by it is returned. A + negative value of step in slice is supported, meaning that the results + will be returned in reverse order. + + This method is equivalent to :meth:`Index.read_indices`. 
+ + """ + + if is_idx(key): + key = operator.index(key) + + if key < 0: + # To support negative values + key += self.nelements + return self.read_indices(key, key + 1, 1)[0] + elif isinstance(key, slice): + return self.read_indices(key.start, key.stop, key.step) + + def __len__(self): + return self.nelements + + def restorecache(self): + """Clean the limits cache and resize starts and lengths arrays""" + + params = self._v_file.params + # The sorted IndexArray is absolutely required to be in memory + # at the same time than the Index instance, so create a strong + # reference to it. We are not introducing leaks because the + # strong reference will disappear when this Index instance is + # to be closed. + self._sorted = self.sorted + self._sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'], + params['BOUNDS_MAX_SIZE'], + 'non-opt types bounds') + self.sorted.boundscache = ObjectCache(params['BOUNDS_MAX_SLOTS'], + params['BOUNDS_MAX_SIZE'], + 'non-opt types bounds') + """A cache for the bounds (2nd hash) data. Only used for + non-optimized types searches.""" + self.limboundscache = ObjectCache(params['LIMBOUNDS_MAX_SLOTS'], + params['LIMBOUNDS_MAX_SIZE'], + 'bounding limits') + """A cache for bounding limits.""" + self.sortedLRcache = ObjectCache(params['SORTEDLR_MAX_SLOTS'], + params['SORTEDLR_MAX_SIZE'], + 'last row chunks') + """A cache for the last row chunks. Only used for searches in + the last row, and mainly useful for small indexes.""" + self.starts = np.empty(shape=self.nrows, dtype=np.int32) + self.lengths = np.empty(shape=self.nrows, dtype=np.int32) + self.sorted._init_sorted_slice(self) + self.dirtycache = False + + def search(self, item): + """Do a binary search in this index for an item.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering search", tref) + + if self.dirtycache: + self.restorecache() + + # An empty item or if left limit is larger than the right one + # means that the number of records is always going to be empty, + # so we avoid further computation (including looking up the + # limits cache). + if not item or item[0] > item[1]: + self.starts[:] = 0 + self.lengths[:] = 0 + return 0 + + tlen = 0 + # Check whether the item tuple is in the limits cache or not + nslot = self.limboundscache.getslot(item) + if nslot >= 0: + startlengths = self.limboundscache.getitem(nslot) + # Reset the lengths array (not necessary for starts) + self.lengths[:] = 0 + # Now, set the interesting rows + for nrow2, start, length in startlengths: + self.starts[nrow2] = start + self.lengths[nrow2] = length + tlen = tlen + length + return tlen + # The item is not in cache. Do the real lookup. + sorted = self.sorted + if self.nslices > 0: + if self.type in self.opt_search_types: + # The next are optimizations. However, they hide the + # CPU functions consumptions from python profiles. + # You may want to de-activate them during profiling. 
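+                # Each branch dispatches to a type-specialised binary search
+                # implemented in C/Cython (indexesextension), avoiding
+                # per-element Python overhead.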
+ if self.type == "int32": + tlen = sorted._search_bin_na_i(*item) + elif self.type == "int64": + tlen = sorted._search_bin_na_ll(*item) + elif self.type == "float16": + tlen = sorted._search_bin_na_e(*item) + elif self.type == "float32": + tlen = sorted._search_bin_na_f(*item) + elif self.type == "float64": + tlen = sorted._search_bin_na_d(*item) + elif self.type == "float96": + tlen = sorted._search_bin_na_g(*item) + elif self.type == "float128": + tlen = sorted._search_bin_na_g(*item) + elif self.type == "uint32": + tlen = sorted._search_bin_na_ui(*item) + elif self.type == "uint64": + tlen = sorted._search_bin_na_ull(*item) + elif self.type == "int8": + tlen = sorted._search_bin_na_b(*item) + elif self.type == "int16": + tlen = sorted._search_bin_na_s(*item) + elif self.type == "uint8": + tlen = sorted._search_bin_na_ub(*item) + elif self.type == "uint16": + tlen = sorted._search_bin_na_us(*item) + else: + assert False, "This can't happen!" + else: + tlen = self.search_scalar(item, sorted) + # Get possible remaining values in last row + if self.nelementsSLR > 0: + # Look for more indexes in the last row + (start, stop) = self.search_last_row(item) + self.starts[-1] = start + self.lengths[-1] = stop - start + tlen += stop - start + + if self.limboundscache.couldenablecache(): + # Get a startlengths tuple and save it in cache. + # This is quite slow, but it is a good way to compress + # the bounds info. Moreover, the .couldenablecache() + # is doing a good work so as to avoid computing this + # when it is not necessary to do it. + startlengths = [] + for nrow, length in enumerate(self.lengths): + if length > 0: + startlengths.append((nrow, self.starts[nrow], length)) + # Compute the size of the recarray (aproximately) + # The +1 at the end is important to avoid 0 lengths + # (remember, the object headers take some space) + size = len(startlengths) * 8 * 2 + 1 + # Put this startlengths list in cache + self.limboundscache.setitem(item, startlengths, size) + + if profile: + show_stats("Exiting search", tref) + return tlen + + # This is an scalar version of search. It works with strings as well. + def search_scalar(self, item, sorted): + """Do a binary search in this index for an item.""" + + tlen = 0 + # Do the lookup for values fullfilling the conditions + for i in range(self.nslices): + (start, stop) = sorted._search_bin(i, item) + self.starts[i] = start + self.lengths[i] = stop - start + tlen += stop - start + return tlen + + def search_last_row(self, item): + # Variable initialization + item1, item2 = item + bebounds = self.bebounds + b0, b1 = bebounds[0], bebounds[-1] + bounds = bebounds[1:-1] + itemsize = self.dtype.itemsize + sortedLRcache = self.sortedLRcache + hi = self.nelementsSLR # maximum number of elements + rchunksize = self.chunksize // self.reduction + + nchunk = -1 + # Lookup for item1 + if nan_aware_gt(item1, b0): + if nan_aware_le(item1, b1): + # Search the appropriate chunk in bounds cache + nchunk = bisect_left(bounds, item1) + # Lookup for this chunk in cache + nslot = sortedLRcache.getslot(nchunk) + if nslot >= 0: + chunk = sortedLRcache.getitem(nslot) + else: + begin = rchunksize * nchunk + end = rchunksize * (nchunk + 1) + if end > hi: + end = hi + # Read the chunk from disk + chunk = self.sortedLR._read_sorted_slice( + self.sorted, begin, end) + # Put it in cache. It's important to *copy* + # the buffer, as it is reused in future reads! 
+ sortedLRcache.setitem(nchunk, chunk.copy(), + (end - begin) * itemsize) + start = bisect_left(chunk, item1) + start += rchunksize * nchunk + else: + start = hi + else: + start = 0 + # Lookup for item2 + if nan_aware_ge(item2, b0): + if nan_aware_lt(item2, b1): + # Search the appropriate chunk in bounds cache + nchunk2 = bisect_right(bounds, item2) + if nchunk2 != nchunk: + # Lookup for this chunk in cache + nslot = sortedLRcache.getslot(nchunk2) + if nslot >= 0: + chunk = sortedLRcache.getitem(nslot) + else: + begin = rchunksize * nchunk2 + end = rchunksize * (nchunk2 + 1) + if end > hi: + end = hi + # Read the chunk from disk + chunk = self.sortedLR._read_sorted_slice( + self.sorted, begin, end) + # Put it in cache. It's important to *copy* + # the buffer, as it is reused in future reads! + # See bug #60 in xot.carabos.com + sortedLRcache.setitem(nchunk2, chunk.copy(), + (end - begin) * itemsize) + stop = bisect_right(chunk, item2) + stop += rchunksize * nchunk2 + else: + stop = hi + else: + stop = 0 + return (start, stop) + + def get_chunkmap(self): + """Compute a map with the interesting chunks in index.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering get_chunkmap", tref) + ss = self.slicesize + nsb = self.nslicesblock + nslices = self.nslices + lbucket = self.lbucket + indsize = self.indsize + bucketsinblock = self.blocksize / lbucket + nchunks = math.ceil(self.nelements / lbucket) + chunkmap = np.zeros(shape=nchunks, dtype="bool") + reduction = self.reduction + starts = (self.starts - 1) * reduction + 1 + stops = (self.starts + self.lengths) * reduction + starts[starts < 0] = 0 # All negative values set to zero + indices = self.indices + for nslice in range(self.nrows): + start = starts[nslice] + stop = stops[nslice] + if stop > start: + idx = np.empty(shape=stop - start, dtype='u%d' % indsize) + if nslice < nslices: + indices._read_index_slice(nslice, start, stop, idx) + else: + self.indicesLR._read_index_slice(start, stop, idx) + if indsize == 8: + idx //= lbucket + elif indsize == 2: + # The chunkmap size cannot be never larger than 'int_' + idx = idx.astype("int_") + offset = int((nslice // nsb) * bucketsinblock) + idx += offset + elif indsize == 1: + # The chunkmap size cannot be never larger than 'int_' + idx = idx.astype("int_") + offset = (nslice * ss) // lbucket + idx += offset + chunkmap[idx] = True + # The case lbucket < nrowsinchunk should only happen in tests + nrowsinchunk = self.nrowsinchunk + if lbucket != nrowsinchunk: + # Map the 'coarse grain' chunkmap into the 'true' chunkmap + nelements = self.nelements + tnchunks = math.ceil(nelements / nrowsinchunk) + tchunkmap = np.zeros(shape=tnchunks, dtype="bool") + ratio = lbucket / nrowsinchunk + idx = chunkmap.nonzero()[0] + starts = (idx * ratio).astype('int_') + stops = np.ceil((idx + 1) * ratio).astype('int_') + for start, stop in zip(starts, stops): + tchunkmap[start:stop] = True + chunkmap = tchunkmap + if profile: + show_stats("Exiting get_chunkmap", tref) + return chunkmap + + def get_lookup_range(self, ops, limits): + assert len(ops) in [1, 2] + assert len(limits) in [1, 2] + assert len(ops) == len(limits) + + column = self.column + coldtype = column.dtype.base + itemsize = coldtype.itemsize + + if len(limits) == 1: + assert ops[0] in ['lt', 'le', 'eq', 'ge', 'gt'] + limit = limits[0] + op = ops[0] + if op == 'lt': + range_ = (inftype(coldtype, itemsize, sign=-1), + nextafter(limit, -1, coldtype, itemsize)) + elif op == 'le': + range_ = (inftype(coldtype, itemsize, sign=-1), + limit) + 
elif op == 'gt': + range_ = (nextafter(limit, +1, coldtype, itemsize), + inftype(coldtype, itemsize, sign=+1)) + elif op == 'ge': + range_ = (limit, + inftype(coldtype, itemsize, sign=+1)) + elif op == 'eq': + range_ = (limit, limit) + + elif len(limits) == 2: + assert ops[0] in ('gt', 'ge') and ops[1] in ('lt', 'le') + + lower, upper = limits + if lower > upper: + # ``a <[=] x <[=] b`` is always false if ``a > b``. + return () + + if ops == ('gt', 'lt'): # lower < col < upper + range_ = (nextafter(lower, +1, coldtype, itemsize), + nextafter(upper, -1, coldtype, itemsize)) + elif ops == ('ge', 'lt'): # lower <= col < upper + range_ = (lower, nextafter(upper, -1, coldtype, itemsize)) + elif ops == ('gt', 'le'): # lower < col <= upper + range_ = (nextafter(lower, +1, coldtype, itemsize), upper) + elif ops == ('ge', 'le'): # lower <= col <= upper + range_ = (lower, upper) + + return range_ + + def _f_remove(self, recursive=False): + """Remove this Index object.""" + + # Index removal is always recursive, + # no matter what `recursive` says. + super()._f_remove(True) + + def __str__(self): + """This provides a more compact representation than __repr__""" + + # The filters + filters = [] + if self.filters.complevel: + if self.filters.shuffle: + filters.append('shuffle') + if self.filters.bitshuffle: + filters.append('bitshuffle') + filters.append(f'{self.filters.complib}({self.filters.complevel})') + return (f"Index({self.optlevel}, " + f"{self.kind}{', '.join(filters)}).is_csi={self.is_csi}") + + def __repr__(self): + """This provides more metainfo than standard __repr__""" + + cpathname = f"{self.table._v_pathname}.cols.{self.column.pathname}" + retstr = f"""{self._v_pathname} (Index for column {cpathname}) + optlevel := {self.optlevel} + kind := {self.kind} + filters := {self.filters} + is_csi := {self.is_csi} + nelements := {self.nelements} + chunksize := {self.chunksize} + slicesize := {self.slicesize} + blocksize := {self.blocksize} + superblocksize := {self.superblocksize} + dirty := {self.dirty} + byteorder := {self.byteorder!r} + sorted := {self.sorted} + indices := {self.indices} + ranges := {self.ranges} + bounds := {self.bounds} + sortedLR := {self.sortedLR} + indicesLR := {self.indicesLR}""" + return retstr + + +class IndexesDescG(NotLoggedMixin, Group): + _c_classid = 'DINDEX' + + def _g_width_warning(self): + warnings.warn( + "the number of indexed columns on a single description group " + "is exceeding the recommended maximum (%d); " + "be ready to see PyTables asking for *lots* of memory " + "and possibly slow I/O" % self._v_max_group_width, + PerformanceWarning) + + +class IndexesTableG(NotLoggedMixin, Group): + _c_classid = 'TINDEX' + + @property + def auto(self): + if 'AUTO_INDEX' not in self._v_attrs: + return default_auto_index + return self._v_attrs.AUTO_INDEX + + @auto.setter + def auto(self, auto): + self._v_attrs.AUTO_INDEX = bool(auto) + + @auto.deleter + def auto(self): + del self._v_attrs.AUTO_INDEX + + def _g_width_warning(self): + warnings.warn( + "the number of indexed columns on a single table " + "is exceeding the recommended maximum (%d); " + "be ready to see PyTables asking for *lots* of memory " + "and possibly slow I/O" % self._v_max_group_width, + PerformanceWarning) + + def _g_check_name(self, name): + if not name.startswith('_i_'): + raise ValueError( + "names of index groups must start with ``_i_``: %s" % name) + + @property + def table(self): + """Accessor for the `Table` object of this `IndexesTableG` + container.""" + names = 
self._v_pathname.split("/") + tablename = names.pop()[3:] # "_i_" is at the beginning + parentpathname = "/".join(names) + tablepathname = join_path(parentpathname, tablename) + table = self._v_file._get_node(tablepathname) + return table + + +class OldIndex(NotLoggedMixin, Group): + """This is meant to hide indexes of PyTables 1.x files.""" + + _c_classid = 'CINDEX' diff --git a/tables/indexes.py b/tables/indexes.py new file mode 100644 index 0000000..78083ba --- /dev/null +++ b/tables/indexes.py @@ -0,0 +1,169 @@ +"""Here is defined the IndexArray class.""" + +from bisect import bisect_left, bisect_right + +from .node import NotLoggedMixin +from .carray import CArray +from .earray import EArray +from . import indexesextension + + +# Declarations for inheriting + + +class CacheArray(indexesextension.CacheArray, NotLoggedMixin, EArray): + """Container for keeping index caches of 1st and 2nd level.""" + + # Class identifier. + _c_classid = 'CACHEARRAY' + + +class LastRowArray(indexesextension.LastRowArray, NotLoggedMixin, CArray): + """Container for keeping sorted and indices values of last row of an + index.""" + + # Class identifier. + _c_classid = 'LASTROWARRAY' + + +class IndexArray(indexesextension.IndexArray, NotLoggedMixin, EArray): + """Represent the index (sorted or reverse index) dataset in HDF5 file. + + All NumPy typecodes are supported except for complex datatypes. + + Parameters + ---------- + parentnode + The Index class from which this object will hang off. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode*. + + name : str + The name of this node in its parent group. + atom + An Atom object representing the shape and type of the atomic objects to + be saved. Only scalar atoms are supported. + title + Sets a TITLE attribute on the array entity. + filters : Filters + An instance of the Filters class that provides information about the + desired I/O filters to be applied during the life of this object. + byteorder + The byteroder of the data on-disk. + + """ + + # Class identifier. + _c_classid = 'INDEXARRAY' + + @property + def chunksize(self): + """The chunksize for this object.""" + return self.chunkshape[1] + + @property + def slicesize(self): + """The slicesize for this object.""" + return self.shape[1] + + def __init__(self, parentnode, name, + atom=None, title="", + filters=None, byteorder=None): + """Create an IndexArray instance.""" + + self._v_pathname = parentnode._g_join(name) + if atom is not None: + # The shape and chunkshape needs to be fixed here + if name == "sorted": + reduction = parentnode.reduction + shape = (0, parentnode.slicesize // reduction) + chunkshape = (1, parentnode.chunksize // reduction) + else: + shape = (0, parentnode.slicesize) + chunkshape = (1, parentnode.chunksize) + else: + # The shape and chunkshape will be read from disk later on + shape = None + chunkshape = None + + super().__init__( + parentnode, name, atom, shape, title, filters, + chunkshape=chunkshape, byteorder=byteorder) + + # This version of searchBin uses both ranges (1st level) and + # bounds (2nd level) caches. It uses a cache for boundary rows, + # but not for 'sorted' rows (this is only supported for the + # 'optimized' types). 
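For orientation before the method itself, the two-level lookup that `_search_bin` below performs can be modelled in a few lines of pure Python. Everything in this sketch is illustrative only: the argument names and the `read_chunk` callable are stand-ins for the ranges/bounds/sorted datasets, not part of the PyTables API.

    from bisect import bisect_left

    def toy_search_bin(item, slice_min, slice_max, bounds, read_chunk,
                       chunksize, slicesize):
        # 0) cheap check against the (min, max) range of the whole slice
        if item <= slice_min:
            return 0              # every element of the slice is >= item
        if item > slice_max:
            return slicesize      # no element of the slice is >= item
        # 1) first level: bisect the chunk boundary values ("bounds")
        nchunk = bisect_left(bounds, item)
        # 2) second level: read (or take from an LRU cache) that one chunk
        chunk = read_chunk(nchunk)
        # 3) bisect inside the chunk and translate to a slice-wide offset
        return chunksize * nchunk + bisect_left(chunk, item)

The real method does this twice per slice (leftmost position for `item1`, rightmost for `item2`) and caches both the bounds rows and the sorted chunks.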
+ def _search_bin(self, nrow, item): + item1, item2 = item + result1 = -1 + result2 = -1 + hi = self.shape[1] + ranges = self._v_parent.rvcache + boundscache = self.boundscache + # First, look at the beginning of the slice + begin = ranges[nrow, 0] + # Look for items at the beginning of sorted slices + if item1 <= begin: + result1 = 0 + if item2 < begin: + result2 = 0 + if result1 >= 0 and result2 >= 0: + return (result1, result2) + # Then, look for items at the end of the sorted slice + end = ranges[nrow, 1] + if result1 < 0: + if item1 > end: + result1 = hi + if result2 < 0: + if item2 >= end: + result2 = hi + if result1 >= 0 and result2 >= 0: + return (result1, result2) + # Finally, do a lookup for item1 and item2 if they were not found + # Lookup in the middle of slice for item1 + chunksize = self.chunksize # Number of elements/chunksize + nchunk = -1 + # Try to get the bounds row from the LRU cache + nslot = boundscache.getslot(nrow) + if nslot >= 0: + # Cache hit. Use the row kept there. + bounds = boundscache.getitem(nslot) + else: + # No luck with cached data. Read the row and put it in the cache. + bounds = self._v_parent.bounds[nrow] + size = bounds.size * bounds.itemsize + boundscache.setitem(nrow, bounds, size) + if result1 < 0: + # Search the appropriate chunk in bounds cache + nchunk = bisect_left(bounds, item1) + chunk = self._read_sorted_slice(nrow, chunksize * nchunk, + chunksize * (nchunk + 1)) + result1 = indexesextension._bisect_left(chunk, item1, chunksize) + result1 += chunksize * nchunk + # Lookup in the middle of slice for item2 + if result2 < 0: + # Search the appropriate chunk in bounds cache + nchunk2 = bisect_right(bounds, item2) + if nchunk2 != nchunk: + chunk = self._read_sorted_slice(nrow, chunksize * nchunk2, + chunksize * (nchunk2 + 1)) + result2 = indexesextension._bisect_right(chunk, item2, chunksize) + result2 += chunksize * nchunk2 + return (result1, result2) + + def __str__(self): + """A compact representation of this class""" + return f"IndexArray(path={self._v_pathname})" + + def __repr__(self): + """A verbose representation of this class.""" + + return f"""{self} + atom = {self.atom!r} + shape = {self.shape} + nrows = {self.nrows} + chunksize = {self.chunksize} + slicesize = {self.slicesize} + byteorder = {self.byteorder!r}""" diff --git a/tables/indexesextension.pyx b/tables/indexesextension.pyx new file mode 100644 index 0000000..8c7e1ba --- /dev/null +++ b/tables/indexesextension.pyx @@ -0,0 +1,1531 @@ +######################################################################## +# +# License: BSD +# Created: May 18, 2006 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +"""cython interface for keeping indexes classes. 
+ +Classes (type extensions): + + IndexArray + CacheArray + LastRowArray + +Functions: + + keysort + +Misc variables: + +""" + +import cython +import numpy +cimport numpy as cnp + +from .exceptions import HDF5ExtError +from .hdf5extension cimport Array + + +# Types, constants, functions, classes & other objects from everywhere +from numpy cimport (import_array, ndarray, npy_int8, npy_int16, npy_int32, + npy_int64, npy_uint8, npy_uint16, npy_uint32, npy_uint64, + npy_float32, npy_float64, npy_float, npy_double, + npy_longdouble, PyArray_BYTES, PyArray_DATA) + +# These two types are defined in npy_common.h but not in cython's numpy.pxd +ctypedef unsigned char npy_bool +ctypedef npy_uint16 npy_float16 + +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, strncmp + +from .definitions cimport hid_t, herr_t, hsize_t, H5Screate_simple, H5Sclose +from .lrucacheextension cimport NumCache + + + +#------------------------------------------------------------------- + +# External C functions + +# Functions for optimized operations with ARRAY for indexing purposes +cdef extern from "H5ARRAY-opt.h" nogil: + herr_t H5ARRAYOinit_readSlice( + hid_t dataset_id, hid_t *mem_space_id, hsize_t count) + herr_t H5ARRAYOread_readSlice( + hid_t dataset_id, hid_t type_id, + hsize_t irow, hsize_t start, hsize_t stop, void *data) + herr_t H5ARRAYOread_readSortedSlice( + hid_t dataset_id, hid_t mem_space_id, hid_t type_id, + hsize_t irow, hsize_t start, hsize_t stop, void *data) + herr_t H5ARRAYOread_readBoundsSlice( + hid_t dataset_id, hid_t mem_space_id, hid_t type_id, + hsize_t irow, hsize_t start, hsize_t stop, void *data) + herr_t H5ARRAYOreadSliceLR( + hid_t dataset_id, hid_t type_id, hsize_t start, hsize_t stop, void *data) + + +# Functions for optimized operations for dealing with indexes +cdef extern from "idx-opt.h" nogil: + int bisect_left_b(npy_int8 *a, long x, int hi, int offset) + int bisect_left_ub(npy_uint8 *a, long x, int hi, int offset) + int bisect_right_b(npy_int8 *a, long x, int hi, int offset) + int bisect_right_ub(npy_uint8 *a, long x, int hi, int offset) + int bisect_left_s(npy_int16 *a, long x, int hi, int offset) + int bisect_left_us(npy_uint16 *a, long x, int hi, int offset) + int bisect_right_s(npy_int16 *a, long x, int hi, int offset) + int bisect_right_us(npy_uint16 *a, long x, int hi, int offset) + int bisect_left_i(npy_int32 *a, long x, int hi, int offset) + int bisect_left_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset) + int bisect_right_i(npy_int32 *a, long x, int hi, int offset) + int bisect_right_ui(npy_uint32 *a, npy_uint32 x, int hi, int offset) + int bisect_left_ll(npy_int64 *a, npy_int64 x, int hi, int offset) + int bisect_left_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset) + int bisect_right_ll(npy_int64 *a, npy_int64 x, int hi, int offset) + int bisect_right_ull(npy_uint64 *a, npy_uint64 x, int hi, int offset) + int bisect_left_e(npy_float16 *a, npy_float64 x, int hi, int offset) + int bisect_right_e(npy_float16 *a, npy_float64 x, int hi, int offset) + int bisect_left_f(npy_float32 *a, npy_float64 x, int hi, int offset) + int bisect_right_f(npy_float32 *a, npy_float64 x, int hi, int offset) + int bisect_left_d(npy_float64 *a, npy_float64 x, int hi, int offset) + int bisect_right_d(npy_float64 *a, npy_float64 x, int hi, int offset) + int bisect_left_g(npy_longdouble *a, npy_longdouble x, int hi, int offset) + int bisect_right_g(npy_longdouble *a, npy_longdouble x, int hi, int offset) + + 
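The `bisect_left_*`/`bisect_right_*` routines declared above are typed counterparts of the standard-library `bisect` functions (with an extra offset argument), and it is the left/right asymmetry that makes a search come back as a half-open `[start, stop)` row range. A standard-library snippet, purely for illustration:

    from bisect import bisect_left, bisect_right

    sorted_slice = [1, 3, 3, 3, 7, 9]
    item1, item2 = 3, 3                            # query: 3 <= x <= 3
    start = bisect_left(sorted_slice, item1)       # 1, first index with x >= 3
    stop = bisect_right(sorted_slice, item2)       # 4, first index with x > 3
    assert sorted_slice[start:stop] == [3, 3, 3]   # [start, stop) selects the hits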
+#---------------------------------------------------------------------------- + +# Initialization code + +# The numpy API requires this function to be called before +# using any numpy facilities in an extension module. +import_array() + +#--------------------------------------------------------------------------- + +ctypedef fused floating_type: + npy_float32 + npy_float64 + npy_longdouble + + +ctypedef fused number_type: + npy_int8 + npy_int16 + npy_int32 + npy_int64 + + npy_uint8 + npy_uint16 + npy_uint32 + npy_uint64 + + npy_float32 + npy_float64 + npy_longdouble + +#=========================================================================== +# Functions +#=========================================================================== + +#--------------------------------------------------------------------------- +# keysort +#--------------------------------------------------------------------------- + +DEF PYA_QS_STACK = 100 +DEF SMALL_QUICKSORT = 15 + +def keysort(ndarray array1, ndarray array2): + """Sort array1 in-place. array2 is also sorted following the array1 order. + + array1 can be of any type, except complex or string. array2 may be made of + elements on any size. + + """ + cdef size_t size = cnp.PyArray_SIZE(array1) + cdef size_t elsize1 = cnp.PyArray_ITEMSIZE(array1) + cdef size_t elsize2 = cnp.PyArray_ITEMSIZE(array2) + cdef int type_num = cnp.PyArray_TYPE(array1) + + # floating types + if type_num == cnp.NPY_FLOAT16: + _keysort[npy_float16](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_FLOAT32: + _keysort[npy_float32](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_FLOAT64: + _keysort[npy_float64](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_LONGDOUBLE: + _keysort[npy_longdouble](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + # signed integer types + elif type_num == cnp.NPY_INT8: + _keysort[npy_int8](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_INT16: + _keysort[npy_int16](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_INT32: + _keysort[npy_int32](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_INT64: + _keysort[npy_int64](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + # unsigned integer types + elif type_num == cnp.NPY_UINT8: + _keysort[npy_uint8](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_UINT16: + _keysort[npy_uint16](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_UINT32: + _keysort[npy_uint32](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_UINT64: + _keysort[npy_uint64](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + # other + elif type_num == cnp.NPY_BOOL: + _keysort[npy_bool](PyArray_DATA(array1), PyArray_BYTES(array2), elsize2, size) + elif type_num == cnp.NPY_STRING: + _keysort_string(PyArray_BYTES(array1), elsize1, PyArray_BYTES(array2), elsize2, size) + else: + raise ValueError("Unknown array datatype") + + +cdef inline void swap_bytes(char *x, char *y, size_t n) nogil: + if n == 8: + (x)[0], (y)[0] = (y)[0], (x)[0] + elif n == 4: + (x)[0], (y)[0] = (y)[0], (x)[0] + elif n == 2: + (x)[0], (y)[0] = (y)[0], (x)[0] + else: + for i in range(n): + x[i], y[i] = y[i], x[i] + + +cdef inline int less_than(number_type* a, number_type* b) nogil: + if number_type in 
floating_type: + return a[0] < b[0] or (b[0] != b[0] and a[0] == a[0]) + else: + return a[0] < b[0] + + +@cython.cdivision(True) +cdef void _keysort(number_type* start1, char* start2, size_t elsize2, size_t n) nogil: + cdef number_type *pl = start1 + cdef number_type *pr = start1 + (n - 1) + + cdef char *ipl = start2 + cdef char *ipr = start2 + (n - 1) * elsize2 + + cdef number_type vp + cdef char *ivp = malloc(elsize2) + + cdef number_type *stack[PYA_QS_STACK] + cdef number_type **sptr = stack + + cdef char *istack[PYA_QS_STACK] + cdef char **isptr = istack + + cdef size_t stack_index = 0 + + cdef number_type *pm + cdef number_type *pi + cdef number_type *pj + cdef number_type *pt + cdef char *ipm + cdef char *ipi + cdef char *ipj + cdef char *ipt + + while True: + while pr - pl > SMALL_QUICKSORT: + pm = pl + ((pr - pl) >> 1) + ipm = ipl + ((ipr - ipl)//elsize2 >> 1)*elsize2 + + if less_than(pm, pl): + pm[0], pl[0] = pl[0], pm[0] + swap_bytes(ipm, ipl, elsize2) + + if less_than(pr, pm): + pr[0], pm[0] = pm[0], pr[0] + swap_bytes(ipr, ipm, elsize2) + + if less_than(pm, pl): + pm[0], pl[0] = pl[0], pm[0] + swap_bytes(ipm, ipl, elsize2) + + vp = pm[0] + + pi = pl + ipi = ipl + + pj = pr - 1 + ipj = ipr - elsize2 + + pm[0], pj[0] = pj[0], pm[0] + swap_bytes(ipm, ipj, elsize2) + + while True: + pi += 1 + ipi += elsize2 + while less_than(pi, &vp): + pi += 1 + ipi += elsize2 + + pj -= 1 + ipj -= elsize2 + while less_than(&vp, pj): + pj -= 1 + ipj -= elsize2 + + if pi >= pj: + break + + pi[0], pj[0] = pj[0], pi[0] + swap_bytes(ipi, ipj, elsize2) + + pi[0], (pr-1)[0] = (pr-1)[0], pi[0] + swap_bytes(ipi, ipr-elsize2, elsize2) + + # push largest partition on stack and proceed with the other + if (pi - pl) < (pr - pi): + sptr[0] = pi + 1 + sptr[1] = pr + sptr += 2 + + isptr[0] = ipi + elsize2 + isptr[1] = ipr + isptr += 2 + + pr = pi - 1 + ipr = ipi - elsize2 + else: + sptr[0] = pl + sptr[1] = pi - 1 + sptr += 2 + + isptr[0] = ipl + isptr[1] = ipi - elsize2 + isptr += 2 + + pl = pi + 1 + ipl = ipi + elsize2 + + pi = pl + 1 + ipi = ipl + elsize2 + while pi <= pr: + vp = pi[0] + memcpy(ivp, ipi, elsize2) + + pj = pi + pt = pi - 1 + + ipj = ipi + ipt = ipi - elsize2 + + while pj > pl and less_than(&vp, pt): + pj[0] = pt[0] + pj -= 1 + pt -= 1 + + memcpy(ipj, ipt, elsize2) + ipj -= elsize2 + ipt -= elsize2 + + pj[0] = vp + memcpy(ipj, ivp, elsize2) + + pi += 1 + ipi += elsize2 + + if sptr == stack: + break + + sptr -= 2 + pl = sptr[0] + pr = sptr[1] + + isptr -= 2 + ipl = isptr[0] + ipr = isptr[1] + + free(ivp) + + +@cython.cdivision(True) +cdef void _keysort_string(char* start1, size_t ss, char* start2, size_t ts, size_t n) nogil: + cdef char *pl = start1 + cdef char *pr = start1 + (n - 1) * ss + + cdef char *ipl = start2 + cdef char *ipr = start2 + (n - 1) * ts + + cdef char *vp = malloc(ss) + cdef char *ivp = malloc(ts) + + cdef char *stack[PYA_QS_STACK] + cdef char **sptr = stack + + cdef char *istack[PYA_QS_STACK] + cdef char **isptr = istack + + cdef size_t stack_index = 0 + + cdef char *pm + cdef char *pi + cdef char *pj + cdef char *pt + + cdef char *ipm + cdef char *ipi + cdef char *ipj + cdef char *ipt + + while True: + while pr - pl > SMALL_QUICKSORT * ss: + pm = pl + ((pr - pl)//ss >> 1)*ss + ipm = ipl + ((ipr - ipl)//ts >> 1)*ts + + if strncmp(pm, pl, ss) < 0: + swap_bytes(pm, pl, ss) + swap_bytes(ipm, ipl, ts) + + if strncmp(pr, pm, ss) < 0: + swap_bytes(pr, pm, ss) + swap_bytes(ipr, ipm, ts) + + if strncmp(pm, pl, ss) < 0: + swap_bytes(pm, pl, ss) + swap_bytes(ipm, ipl, ts) + + memcpy(vp, 
pm, ss) + + pi = pl + ipi = ipl + + pj = pr - ss + ipj = ipr - ts + + swap_bytes(pm, pj, ss) + swap_bytes(ipm, ipj, ts) + + while True: + pi += ss + ipi += ts + while strncmp(pi, vp, ss) < 0: + pi += ss + ipi += ts + + pj -= ss + ipj -= ts + while strncmp(vp, pj, ss) < 0: + pj -= ss + ipj -= ts + + if pi >= pj: + break + + swap_bytes(pi, pj, ss) + swap_bytes(ipi, ipj, ts) + + swap_bytes(pi, pr-ss, ss) + swap_bytes(ipi, ipr-ts, ts) + + # push largest partition on stack and proceed with the other + if (pi - pl) < (pr - pi): + sptr[0] = pi + ss + sptr[1] = pr + sptr += 2 + + isptr[0] = ipi + ts + isptr[1] = ipr + isptr += 2 + + pr = pi - ss + ipr = ipi - ts + else: + sptr[0] = pl + sptr[1] = pi - ss + sptr += 2 + + isptr[0] = ipl + isptr[1] = ipi - ts + isptr += 2 + + pl = pi + ss + ipl = ipi + ts + + pi = pl + ss + ipi = ipl + ts + + while pi <= pr: + memcpy(vp, pi, ss) + memcpy(ivp, ipi, ts) + + pj = pi + pt = pi - ss + + ipj = ipi + ipt = ipi - ts + + while pj > pl and strncmp(vp, pt, ss) < 0: + memcpy(pj, pt, ss) + pj -= ss + pt -= ss + + memcpy(ipj, ipt, ts) + ipj -= ts + ipt -= ts + + memcpy(pj, vp, ss) + memcpy(ipj, ivp, ts) + + pi += ss + ipi += ts + + if sptr == stack: + break + + sptr -= 2 + pl = sptr[0] + pr = sptr[1] + + isptr -= 2 + ipl = isptr[0] + ipr = isptr[1] + + free(vp) + free(ivp) + +#--------------------------------------------------------------------------- +# bisect +#--------------------------------------------------------------------------- + +# This has been copied from the standard module bisect. +# Checks for the values out of limits has been added at the beginning +# because I forsee that this should be a very common case. +# 2004-05-20 +def _bisect_left(a, x, int hi): + """Return the index where to insert item x in list a, assuming a is sorted. + + The return value i is such that all e in a[:i] have e < x, and all e in + a[i:] have e >= x. So if x already appears in the list, i points just + before the leftmost x already there. + + """ + + cdef int lo, mid + + lo = 0 + if x <= a[0]: return 0 + if a[-1] < x: return hi + while lo < hi: + mid = (lo+hi)//2 + if a[mid] < x: lo = mid+1 + else: hi = mid + return lo + + +def _bisect_right(a, x, int hi): + """Return the index where to insert item x in list a, assuming a is sorted. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, i points just + beyond the rightmost x already there. + + """ + + cdef int lo, mid + + lo = 0 + if x < a[0]: return 0 + if a[-1] <= x: return hi + while lo < hi: + mid = (lo+hi)//2 + if x < a[mid]: hi = mid + else: lo = mid+1 + return lo + + +#=========================================================================== +# Classes +#=========================================================================== + + + +cdef class Index: + pass + + +cdef class CacheArray(Array): + """Container for keeping index caches of 1st and 2nd level.""" + + cdef hid_t mem_space_id + + cdef initread(self, int nbounds): + # "Actions to accelerate the reads afterwards." + + # Precompute the mem_space_id + if (H5ARRAYOinit_readSlice(self.dataset_id, &self.mem_space_id, + nbounds) < 0): + raise HDF5ExtError("Problems initializing the bounds array data.") + return + + cdef read_slice(self, hsize_t nrow, hsize_t start, hsize_t stop, void *rbuf): + # "Read an slice of bounds." 
+ + if (H5ARRAYOread_readBoundsSlice( + self.dataset_id, self.mem_space_id, self.type_id, + nrow, start, stop, rbuf) < 0): + raise HDF5ExtError("Problems reading the bounds array data.") + return + + def _g_close(self): + super()._g_close() + # Release specific resources of this class + if self.mem_space_id > 0: + H5Sclose(self.mem_space_id) + + +cdef class IndexArray(Array): + """Container for keeping sorted and indices values.""" + + cdef void *rbufst + cdef void *rbufln + cdef void *rbufrv + cdef void *rbufbc + cdef void *rbuflb + cdef hid_t mem_space_id + cdef int l_chunksize, l_slicesize, nbounds, indsize + cdef CacheArray bounds_ext + cdef NumCache boundscache, sortedcache + cdef ndarray bufferbc, bufferlb + + def _read_index_slice(self, hsize_t irow, hsize_t start, hsize_t stop, + ndarray idx): + cdef herr_t ret + cdef void *buf = PyArray_DATA(idx) + + # Do the physical read + with nogil: + ret = H5ARRAYOread_readSlice(self.dataset_id, self.type_id, + irow, start, stop, buf) + + if ret < 0: + raise HDF5ExtError("Problems reading the index indices.") + + + def _init_sorted_slice(self, index): + """Initialize the structures for doing a binary search.""" + + cdef long ndims + cdef int rank, buflen, cachesize + cdef char *bname + cdef hsize_t count[2] + cdef ndarray starts, lengths, rvcache + cdef object maxslots, rowsize + + dtype = self.atom.dtype + # Create the buffer for reading sorted data chunks if not created yet + if self.bufferlb is None: + # Internal buffers + self.bufferlb = numpy.empty(dtype=dtype, shape=self.chunksize) + # Get the pointers to the different buffer data areas + self.rbuflb = PyArray_DATA(self.bufferlb) + # Init structures for accelerating sorted array reads + rank = 2 + count[0] = 1 + count[1] = self.chunksize + self.mem_space_id = H5Screate_simple(rank, count, NULL) + # Cache some counters in local extension variables + self.l_chunksize = self.chunksize + self.l_slicesize = self.slicesize + + # Get the addresses of buffer data + starts = index.starts + lengths = index.lengths + self.rbufst = PyArray_DATA(starts) + self.rbufln = PyArray_DATA(lengths) + # The 1st cache is loaded completely in memory and needs to be reloaded + rvcache = index.ranges[:] + self.rbufrv = PyArray_DATA(rvcache) + index.rvcache = rvcache + # Init the bounds array for reading + self.nbounds = index.bounds.shape[1] + self.bounds_ext = index.bounds + self.bounds_ext.initread(self.nbounds) + if str(dtype) in self._v_parent.opt_search_types: + # The next caches should be defined only for optimized search types. + # The 2nd level cache will replace the already existing ObjectCache and + # already bound to the boundscache attribute. This way, the cache will + # not be duplicated (I know, this smells badly, but anyway). 
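The cache sizes computed in the lines that follow are just a configurable byte budget divided by the size of one cached row. A rough sketch with made-up numbers (the parameter name matches tables/parameters.py; the values do not come from this file):

    BOUNDS_MAX_SIZE = 1024 * 1024          # example byte budget for the bounds cache
    nbounds = 512                          # example: boundary values kept per slice
    itemsize = 8                           # example: a float64 column
    rowsize = nbounds * itemsize           # bytes taken by one cached bounds row
    maxslots = BOUNDS_MAX_SIZE // rowsize  # -> 256 bounds rows fit in the cache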
+ params = self._v_file.params + rowsize = (self.bounds_ext._v_chunkshape[1] * dtype.itemsize) + maxslots = params['BOUNDS_MAX_SIZE'] // rowsize + self.boundscache = NumCache( + (maxslots, self.nbounds), dtype, 'non-opt types bounds') + self.bufferbc = numpy.empty(dtype=dtype, shape=self.nbounds) + # Get the pointer for the internal buffer for 2nd level cache + self.rbufbc = PyArray_DATA(self.bufferbc) + # Another NumCache for the sorted values + rowsize = (self.chunksize*dtype.itemsize) + maxslots = params['SORTED_MAX_SIZE'] // (self.chunksize*dtype.itemsize) + self.sortedcache = NumCache( + (maxslots, self.chunksize), dtype, 'sorted') + + + + cdef void *_g_read_sorted_slice(self, hsize_t irow, hsize_t start, + hsize_t stop): + """Read the sorted part of an index.""" + + with nogil: + ret = H5ARRAYOread_readSortedSlice( + self.dataset_id, self.mem_space_id, self.type_id, + irow, start, stop, self.rbuflb) + + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + return self.rbuflb + + # can't time machine since this function is cdef'd + #_g_read_sorted_slice = prveious_api(_g_read_sorted_slice) + + # This is callable from python + def _read_sorted_slice(self, hsize_t irow, hsize_t start, hsize_t stop): + """Read the sorted part of an index.""" + + self._g_read_sorted_slice(irow, start, stop) + return self.bufferlb + + + cdef void *get_lru_bounds(self, int nrow, int nbounds): + """Get the bounds from the cache, or read them.""" + + cdef void *vpointer + cdef long nslot + + nslot = self.boundscache.getslot_(nrow) + if nslot >= 0: + vpointer = self.boundscache.getitem1_(nslot) + else: + # Bounds row is not in cache. Read it and put it in the LRU cache. + self.bounds_ext.read_slice(nrow, 0, nbounds, self.rbufbc) + self.boundscache.setitem_(nrow, self.rbufbc, 0) + vpointer = self.rbufbc + return vpointer + + # can't time machine since get_lru_bounds() function is cdef'd + + cdef void *get_lru_sorted(self, int nrow, int ncs, int nchunk, int cs): + """Get the sorted row from the cache or read it.""" + + cdef void *vpointer + cdef npy_int64 nckey + cdef long nslot + cdef hsize_t start, stop + + # Compute the number of chunk read and use it as the key for the cache. + nckey = nrow*ncs+nchunk + nslot = self.sortedcache.getslot_(nckey) + if nslot >= 0: + vpointer = self.sortedcache.getitem1_(nslot) + else: + # The sorted chunk is not in cache. Read it and put it in the LRU cache. + start = cs*nchunk + stop = cs*(nchunk+1) + vpointer = self._g_read_sorted_slice(nrow, start, stop) + self.sortedcache.setitem_(nckey, vpointer, 0) + return vpointer + + # can't time machine since get_lru_sorted() function is cdef'd + + # Optimized version for int8 + def _search_bin_na_b(self, long item1, long item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_int8 *rbufrv + cdef npy_int8 *rbufbc = NULL + cdef npy_int8 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_b(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_b(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_b(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_b(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for uint8 + def _search_bin_na_ub(self, long item1, long item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_uint8 *rbufrv + cdef npy_uint8 *rbufbc = NULL + cdef npy_uint8 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_ub(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_ub(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_ub(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_ub(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for int16 + def _search_bin_na_s(self, long item1, long item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_int16 *rbufrv + cdef npy_int16 *rbufbc = NULL + cdef npy_int16 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_s(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_s(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_s(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_s(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for uint16 + def _search_bin_na_us(self, long item1, long item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_uint16 *rbufrv + cdef npy_uint16 *rbufbc = NULL + cdef npy_uint16 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_us(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_us(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_us(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_us(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for int32 + def _search_bin_na_i(self, long item1, long item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_int32 *rbufrv + cdef npy_int32 *rbufbc = NULL + cdef npy_int32 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_i(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_i(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_i(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_i(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for uint32 + def _search_bin_na_ui(self, npy_uint32 item1, npy_uint32 item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_uint32 *rbufrv + cdef npy_uint32 *rbufbc = NULL + cdef npy_uint32 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_ui(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_ui(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_ui(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_ui(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for int64 + def _search_bin_na_ll(self, npy_int64 item1, npy_int64 item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_int64 *rbufrv + cdef npy_int64 *rbufbc = NULL + cdef npy_int64 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_ll(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_ll(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_ll(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_ll(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for uint64 + def _search_bin_na_ull(self, npy_uint64 item1, npy_uint64 item2): + cdef int cs, ss, ncs, nrow, nrows, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_uint64 *rbufrv + cdef npy_uint64 *rbufbc = NULL + cdef npy_uint64 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + rbufst = self.rbufst + rbufln = self.rbufln + rbufrv = self.rbufrv + tlength = 0 + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_ull(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_ull(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_ull(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_ull(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for float16 + def _search_bin_na_e(self, npy_float64 item1, npy_float64 item2): + cdef int cs, ss, ncs, nrow, nrows, nrow2, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_float16 *rbufrv + cdef npy_float16 *rbufbc = NULL + cdef npy_float16 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + tlength = 0 + rbufst = self.rbufst + rbufln = self.rbufln + # Limits not in cache, do a lookup + rbufrv = self.rbufrv + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_e(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_e(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_e(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_e(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for float32 + def _search_bin_na_f(self, npy_float64 item1, npy_float64 item2): + cdef int cs, ss, ncs, nrow, nrows, nrow2, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + # Variables with specific type + cdef npy_float32 *rbufrv + cdef npy_float32 *rbufbc = NULL + cdef npy_float32 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + tlength = 0 + rbufst = self.rbufst + rbufln = self.rbufln + + # Limits not in cache, do a lookup + rbufrv = self.rbufrv + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_f(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_f(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_f(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_f(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for float64 + def _search_bin_na_d(self, npy_float64 item1, npy_float64 item2): + cdef int cs, ss, ncs, nrow, nrows, nrow2, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_float64 *rbufrv + cdef npy_float64 *rbufbc = NULL + cdef npy_float64 *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + tlength = 0 + rbufst = self.rbufst + rbufln = self.rbufln + + # Limits not in cache, do a lookup + rbufrv = self.rbufrv + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. 
+ rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_d(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_d(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_d(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_d(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + # Optimized version for npy_longdouble/float96/float128 + def _search_bin_na_g(self, npy_longdouble item1, npy_longdouble item2): + cdef int cs, ss, ncs, nrow, nrows, nrow2, nbounds, rvrow + cdef int start, stop, tlength, length, bread, nchunk, nchunk2 + cdef int *rbufst + cdef int *rbufln + + # Variables with specific type + cdef npy_longdouble *rbufrv + cdef npy_longdouble *rbufbc = NULL + cdef npy_longdouble *rbuflb = NULL + + cs = self.l_chunksize + ss = self.l_slicesize + ncs = ss // cs + nbounds = self.nbounds + nrows = self.nrows + tlength = 0 + rbufst = self.rbufst + rbufln = self.rbufln + + # Limits not in cache, do a lookup + rbufrv = self.rbufrv + for nrow from 0 <= nrow < nrows: + rvrow = nrow*2 + bread = 0 + nchunk = -1 + + # Look if item1 is in this row + if item1 > rbufrv[rvrow]: + if item1 <= rbufrv[rvrow+1]: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + bread = 1 + nchunk = bisect_left_g(rbufbc, item1, nbounds, 0) + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk, cs) + start = bisect_left_g(rbuflb, item1, cs, 0) + cs*nchunk + else: + start = ss + else: + start = 0 + # Now, for item2 + if item2 >= rbufrv[rvrow]: + if item2 < rbufrv[rvrow+1]: + if not bread: + # Get the bounds row from the LRU cache or read them. + rbufbc = self.get_lru_bounds(nrow, nbounds) + nchunk2 = bisect_right_g(rbufbc, item2, nbounds, 0) + if nchunk2 <> nchunk: + # Get the sorted row from the LRU cache or read it. + rbuflb = self.get_lru_sorted(nrow, ncs, nchunk2, cs) + stop = bisect_right_g(rbuflb, item2, cs, 0) + cs*nchunk2 + else: + stop = ss + else: + stop = 0 + length = stop - start + tlength = tlength + length + rbufst[nrow] = start + rbufln[nrow] = length + return tlength + + + def _g_close(self): + super()._g_close() + # Release specific resources of this class + if self.mem_space_id > 0: + H5Sclose(self.mem_space_id) + + +cdef class LastRowArray(Array): + """ + Container for keeping sorted and indices values of last rows of an index. 
+ """ + + def _read_index_slice(self, hsize_t start, hsize_t stop, ndarray idx): + """Read the reverse index part of an LR index.""" + + cdef void *buf = PyArray_DATA(idx) + with nogil: + ret = H5ARRAYOreadSliceLR(self.dataset_id, self.type_id, + start, stop, buf) + + if ret < 0: + raise HDF5ExtError("Problems reading the index data in Last Row.") + + + def _read_sorted_slice(self, IndexArray sorted, hsize_t start, hsize_t stop): + """Read the sorted part of an LR index.""" + + cdef void *rbuflb + + rbuflb = sorted.rbuflb # direct access to rbuflb: very fast. + with nogil: + ret = H5ARRAYOreadSliceLR(self.dataset_id, self.type_id, + start, stop, rbuflb) + + if ret < 0: + raise HDF5ExtError("Problems reading the index data.") + return sorted.bufferlb[:stop-start] + + + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/leaf.py b/tables/leaf.py new file mode 100644 index 0000000..d60efd0 --- /dev/null +++ b/tables/leaf.py @@ -0,0 +1,740 @@ +"""Here is defined the Leaf class.""" + +import warnings +import math + +import numpy as np + +from .flavor import (check_flavor, internal_flavor, toarray, + alias_map as flavor_alias_map) +from .node import Node +from .filters import Filters +from .utils import byteorders, lazyattr, SizeType +from .exceptions import PerformanceWarning + + +def csformula(expected_mb): + """Return the fitted chunksize for expected_mb.""" + + # For a basesize of 8 KB, this will return: + # 8 KB for datasets <= 1 MB + # 1 MB for datasets >= 10 TB + basesize = 8 * 1024 # 8 KB is a good minimum + return basesize * int(2**math.log10(expected_mb)) + + +def limit_es(expected_mb): + """Protection against creating too small or too large chunks.""" + + if expected_mb < 1: # < 1 MB + expected_mb = 1 + elif expected_mb > 10**7: # > 10 TB + expected_mb = 10**7 + return expected_mb + + +def calc_chunksize(expected_mb): + """Compute the optimum HDF5 chunksize for I/O purposes. + + Rational: HDF5 takes the data in bunches of chunksize length to + write the on disk. A BTree in memory is used to map structures on + disk. The more chunks that are allocated for a dataset the larger + the B-tree. Large B-trees take memory and causes file storage + overhead as well as more disk I/O and higher contention for the meta + data cache. You have to balance between memory and I/O overhead + (small B-trees) and time to access to data (big B-trees). + + The tuning of the chunksize parameter affects the performance and + the memory consumed. This is based on my own experiments and, as + always, your mileage may vary. + + """ + + expected_mb = limit_es(expected_mb) + zone = int(math.log10(expected_mb)) + expected_mb = 10**zone + chunksize = csformula(expected_mb) + # XXX: Multiply by 8 seems optimal for sequential access + return chunksize * 8 + + +class Leaf(Node): + """Abstract base class for all PyTables leaves. + + A leaf is a node (see the Node class in :class:`Node`) which hangs from a + group (see the Group class in :class:`Group`) but, unlike a group, it can + not have any further children below it (i.e. it is an end node). 
+ + This definition includes all nodes which contain actual data (datasets + handled by the Table - see :ref:`TableClassDescr`, Array - + see :ref:`ArrayClassDescr`, CArray - see :ref:`CArrayClassDescr`, EArray - + see :ref:`EArrayClassDescr`, and VLArray - see :ref:`VLArrayClassDescr` + classes) and unsupported nodes (the UnImplemented + class - :ref:`UnImplementedClassDescr`) these classes do in fact inherit + from Leaf. + + + .. rubric:: Leaf attributes + + These instance variables are provided in addition to those in Node + (see :ref:`NodeClassDescr`): + + .. attribute:: byteorder + + The byte ordering of the leaf data *on disk*. It will be either + ``little`` or ``big``. + + .. attribute:: dtype + + The NumPy dtype that most closely matches this leaf type. + + .. attribute:: extdim + + The index of the enlargeable dimension (-1 if none). + + .. attribute:: nrows + + The length of the main dimension of the leaf data. + + .. attribute:: nrowsinbuf + + The number of rows that fit in internal input buffers. + + You can change this to fine-tune the speed or memory + requirements of your application. + + .. attribute:: shape + + The shape of data in the leaf. + + """ + + # These are a little hard to override, but so are properties. + attrs = Node._v_attrs + """The associated AttributeSet instance - see :ref:`AttributeSetClassDescr` + (This is an easier-to-write alias of :attr:`Node._v_attrs`.""" + title = Node._v_title + """A description for this node + (This is an easier-to-write alias of :attr:`Node._v_title`).""" + + @property + def name(self): + """The name of this node in its parent group (This is an + easier-to-write alias of :attr:`Node._v_name`).""" + return self._v_name + + @property + def chunkshape(self): + """The HDF5 chunk size for chunked leaves (a tuple). + + This is read-only because you cannot change the chunk size of a + leaf once it has been created. + """ + return getattr(self, '_v_chunkshape', None) + + @property + def object_id(self): + """A node identifier, which may change from run to run. + (This is an easier-to-write alias of :attr:`Node._v_objectid`). + + .. versionchanged:: 3.0 + The *objectID* property has been renamed into *object_id*. + + """ + return self._v_objectid + + @property + def ndim(self): + """The number of dimensions of the leaf data. + + .. versionadded: 2.4""" + return len(self.shape) + + @lazyattr + def filters(self): + """Filter properties for this leaf. + + See Also + -------- + Filters + + """ + + return Filters._from_leaf(self) + + @property + def track_times(self): + """Whether timestamps for the leaf are recorded + + If the leaf is not a dataset, this will fail with HDF5ExtError. + + The track times dataset creation property does not seem to + survive closing and reopening as of HDF5 1.8.17. Currently, + it may be more accurate to test whether the ctime for the + dataset is 0: + track_times = (leaf._get_obj_timestamps().ctime == 0) + """ + return self._get_obj_track_times() + + @property + def maindim(self): + """The dimension along which iterators work. + + Its value is 0 (i.e. the first dimension) when the dataset is not + extendable, and self.extdim (where available) for extendable ones. + """ + + if self.extdim < 0: + return 0 # choose the first dimension + return self.extdim + + @property + def flavor(self): + """The type of data object read from this leaf. + + It can be any of 'numpy' or 'python'. + + You can (and are encouraged to) use this property to get, set + and delete the FLAVOR HDF5 attribute of the leaf. 
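A minimal usage sketch of this property (the file name and node here are made up for the example):

    import numpy as np
    import tables as tb

    with tb.open_file("demo.h5", "w") as f:
        arr = f.create_array("/", "x", np.arange(5))
        arr.flavor = "python"          # persisted in the FLAVOR attribute
        print(arr[:])                  # [0, 1, 2, 3, 4] read back as a list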
When the leaf + has no such attribute, the default flavor is used.. + """ + + return self._flavor + + @flavor.setter + def flavor(self, flavor): + self._v_file._check_writable() + check_flavor(flavor) + self._v_attrs.FLAVOR = self._flavor = flavor # logs the change + + @flavor.deleter + def flavor(self): + del self._v_attrs.FLAVOR + self._flavor = internal_flavor + + @property + def size_on_disk(self): + """ + The size of this leaf's data in bytes as it is stored on disk. If the + data is compressed, this shows the compressed size. In the case of + uncompressed, chunked data, this may be slightly larger than the amount + of data, due to partially filled chunks. + """ + return self._get_storage_size() + + def __init__(self, parentnode, name, + new=False, filters=None, + byteorder=None, _log=True, + track_times=True): + self._v_new = new + """Is this the first time the node has been created?""" + self.nrowsinbuf = None + """ + The number of rows that fits in internal input buffers. + + You can change this to fine-tune the speed or memory + requirements of your application. + """ + self._flavor = None + """Private storage for the `flavor` property.""" + + if new: + # Get filter properties from parent group if not given. + if filters is None: + filters = parentnode._v_filters + self.__dict__['filters'] = filters # bypass the property + + if byteorder not in (None, 'little', 'big'): + raise ValueError( + "the byteorder can only take 'little' or 'big' values " + "and you passed: %s" % byteorder) + self.byteorder = byteorder + """The byte ordering of the leaf data *on disk*.""" + + self._want_track_times = track_times + + # Existing filters need not be read since `filters` + # is a lazy property that automatically handles their loading. + + super().__init__(parentnode, name, _log) + + def __len__(self): + """Return the length of the main dimension of the leaf data. + + Please note that this may raise an OverflowError on 32-bit platforms + for datasets having more than 2**31-1 rows. This is a limitation of + Python that you can work around by using the nrows or shape attributes. + + """ + + return self.nrows + + def __str__(self): + """The string representation for this object is its pathname in the + HDF5 object tree plus some additional metainfo.""" + + filters = [] + if self.filters.fletcher32: + filters.append("fletcher32") + if self.filters.complevel: + if self.filters.shuffle: + filters.append("shuffle") + if self.filters.bitshuffle: + filters.append("bitshuffle") + filters.append(f"{self.filters.complib}({self.filters.complevel})") + return (f"{self._v_pathname} ({self.__class__.__name__}" + f"{self.shape}{', '.join(filters)}) {self._v_title!r}") + + def _g_post_init_hook(self): + """Code to be run after node creation and before creation logging. + + This method gets or sets the flavor of the leaf. 
+ + """ + + super()._g_post_init_hook() + if self._v_new: # set flavor of new node + if self._flavor is None: + self._flavor = internal_flavor + else: # flavor set at creation time, do not log + if self._v_file.params['PYTABLES_SYS_ATTRS']: + self._v_attrs._g__setattr('FLAVOR', self._flavor) + else: # get flavor of existing node (if any) + if self._v_file.params['PYTABLES_SYS_ATTRS']: + flavor = getattr(self._v_attrs, 'FLAVOR', internal_flavor) + self._flavor = flavor_alias_map.get(flavor, flavor) + else: + self._flavor = internal_flavor + + def _calc_chunkshape(self, expectedrows, rowsize, itemsize): + """Calculate the shape for the HDF5 chunk.""" + + # In case of a scalar shape, return the unit chunksize + if self.shape == (): + return (SizeType(1),) + + # Compute the chunksize + MB = 1024 * 1024 + expected_mb = (expectedrows * rowsize) // MB + chunksize = calc_chunksize(expected_mb) + + maindim = self.maindim + # Compute the chunknitems + chunknitems = chunksize // itemsize + # Safeguard against itemsizes being extremely large + if chunknitems == 0: + chunknitems = 1 + chunkshape = list(self.shape) + # Check whether trimming the main dimension is enough + chunkshape[maindim] = 1 + newchunknitems = np.prod(chunkshape, dtype=SizeType) + if newchunknitems <= chunknitems: + chunkshape[maindim] = chunknitems // newchunknitems + else: + # No, so start trimming other dimensions as well + for j in range(len(chunkshape)): + # Check whether trimming this dimension is enough + chunkshape[j] = 1 + newchunknitems = np.prod(chunkshape, dtype=SizeType) + if newchunknitems <= chunknitems: + chunkshape[j] = chunknitems // newchunknitems + break + else: + # Ops, we ran out of the loop without a break + # Set the last dimension to chunknitems + chunkshape[-1] = chunknitems + + return tuple(SizeType(s) for s in chunkshape) + + def _calc_nrowsinbuf(self): + """Calculate the number of rows that fits on a PyTables buffer.""" + + params = self._v_file.params + # Compute the nrowsinbuf + rowsize = self.rowsize + buffersize = params['IO_BUFFER_SIZE'] + if rowsize != 0: + nrowsinbuf = buffersize // rowsize + else: + nrowsinbuf = 1 + + # Safeguard against row sizes being extremely large + if nrowsinbuf == 0: + nrowsinbuf = 1 + # If rowsize is too large, issue a Performance warning + maxrowsize = params['BUFFER_TIMES'] * buffersize + if rowsize > maxrowsize: + warnings.warn("""\ +The Leaf ``%s`` is exceeding the maximum recommended rowsize (%d bytes); +be ready to see PyTables asking for *lots* of memory and possibly slow +I/O. You may want to reduce the rowsize by trimming the value of +dimensions that are orthogonal (and preferably close) to the *main* +dimension of this leave. 
Alternatively, in case you have specified a +very small/large chunksize, you may want to increase/decrease it.""" + % (self._v_pathname, maxrowsize), + PerformanceWarning) + return nrowsinbuf + + # This method is appropriate for calls to __getitem__ methods + def _process_range(self, start, stop, step, dim=None, warn_negstep=True): + if dim is None: + nrows = self.nrows # self.shape[self.maindim] + else: + nrows = self.shape[dim] + + if warn_negstep and step and step < 0: + raise ValueError("slice step cannot be negative") + + # if start is not None: start = long(start) + # if stop is not None: stop = long(stop) + # if step is not None: step = long(step) + + return slice(start, stop, step).indices(int(nrows)) + + # This method is appropriate for calls to read() methods + def _process_range_read(self, start, stop, step, warn_negstep=True): + nrows = self.nrows + if start is not None and stop is None and step is None: + # Protection against start greater than available records + # nrows == 0 is a special case for empty objects + if 0 < nrows <= start: + raise IndexError("start of range (%s) is greater than " + "number of rows (%s)" % (start, nrows)) + step = 1 + if start == -1: # corner case + stop = nrows + else: + stop = start + 1 + # Finally, get the correct values (over the main dimension) + start, stop, step = self._process_range(start, stop, step, + warn_negstep=warn_negstep) + return (start, stop, step) + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + # Compute default arguments. + start = kwargs.pop('start', None) + stop = kwargs.pop('stop', None) + step = kwargs.pop('step', None) + title = kwargs.pop('title', self._v_title) + filters = kwargs.pop('filters', self.filters) + chunkshape = kwargs.pop('chunkshape', self.chunkshape) + copyuserattrs = kwargs.pop('copyuserattrs', True) + stats = kwargs.pop('stats', None) + if chunkshape == 'keep': + chunkshape = self.chunkshape # Keep the original chunkshape + elif chunkshape == 'auto': + chunkshape = None # Will recompute chunkshape + + # Fix arguments with explicit None values for backwards compatibility. + if title is None: + title = self._v_title + if filters is None: + filters = self.filters + + # Create a copy of the object. + (new_node, bytes) = self._g_copy_with_stats( + newparent, newname, start, stop, step, + title, filters, chunkshape, _log, **kwargs) + + # Copy user attributes if requested (or the flavor at least). + if copyuserattrs: + self._v_attrs._g_copy(new_node._v_attrs, copyclass=True) + elif 'FLAVOR' in self._v_attrs: + if self._v_file.params['PYTABLES_SYS_ATTRS']: + new_node._v_attrs._g__setattr('FLAVOR', self._flavor) + new_node._flavor = self._flavor # update cached value + + # Update statistics if needed. + if stats is not None: + stats['leaves'] += 1 + stats['bytes'] += bytes + + return new_node + + def _g_fix_byteorder_data(self, data, dbyteorder): + """Fix the byteorder of data passed in constructors.""" + dbyteorder = byteorders[dbyteorder] + # If self.byteorder has not been passed as an argument of + # the constructor, then set it to the same value of data. + if self.byteorder is None: + self.byteorder = dbyteorder + # Do an additional in-place byteswap of data if the in-memory + # byteorder doesn't match that of the on-disk. This is the only + # place that we have to do the conversion manually. In all the + # other cases, it will be HDF5 the responsible of doing the + # byteswap properly. 
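        # Editorial sketch (comments only, nothing here is executed): the
        # in-place swap below relies on numpy's ``ndarray.byteswap``, e.g. for
        # a big-endian buffer that must end up little-endian in memory:
        #     a = np.array([1, 2, 3], dtype='>i4')
        #     a.byteswap(True)    # raw bytes are now in little-endian order
        # after which the buffer matches the byteorder requested for the leaf.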
+ if dbyteorder in ['little', 'big']: + if dbyteorder != self.byteorder: + # if data is not writeable, do a copy first + if not data.flags.writeable: + data = data.copy() + data.byteswap(True) + else: + # Fix the byteorder again, no matter which byteorder have + # specified the user in the constructor. + self.byteorder = "irrelevant" + return data + + def _point_selection(self, key): + """Perform a point-wise selection. + + `key` can be any of the following items: + + * A boolean array with the same shape than self. Those positions + with True values will signal the coordinates to be returned. + + * A numpy array (or list or tuple) with the point coordinates. + This has to be a two-dimensional array of size len(self.shape) + by num_elements containing a list of of zero-based values + specifying the coordinates in the dataset of the selected + elements. The order of the element coordinates in the array + specifies the order in which the array elements are iterated + through when I/O is performed. Duplicate coordinate locations + are not checked for. + + Return the coordinates array. If this is not possible, raise a + `TypeError` so that the next selection method can be tried out. + + This is useful for whatever `Leaf` instance implementing a + point-wise selection. + + """ + input_key = key + if type(key) in (list, tuple): + if isinstance(key, tuple) and len(key) > len(self.shape): + raise IndexError(f"Invalid index or slice: {key!r}") + # Try to convert key to a numpy array. If not possible, + # a TypeError will be issued (to be catched later on). + try: + key = toarray(key) + except ValueError: + raise TypeError(f"Invalid index or slice: {key!r}") + elif not isinstance(key, np.ndarray): + raise TypeError(f"Invalid index or slice: {key!r}") + + # Protection against empty keys + if len(key) == 0: + return np.array([], dtype="i8") + + if key.dtype.kind == 'b': + if not key.shape == self.shape: + raise IndexError( + "Boolean indexing array has incompatible shape") + # Get the True coordinates (64-bit indices!) + coords = np.asarray(key.nonzero(), dtype='i8') + coords = np.transpose(coords) + elif key.dtype.kind == 'i' or key.dtype.kind == 'u': + if len(key.shape) > 2: + raise IndexError( + "Coordinate indexing array has incompatible shape") + elif len(key.shape) == 2: + if key.shape[0] != len(self.shape): + raise IndexError( + "Coordinate indexing array has incompatible shape") + coords = np.asarray(key, dtype="i8") + coords = np.transpose(coords) + else: + # For 1-dimensional datasets + coords = np.asarray(key, dtype="i8") + + # handle negative indices + base = coords if coords.base is None else coords.base + if base is input_key: + # never modify the original "key" data + coords = coords.copy() + + idx = coords < 0 + coords[idx] = (coords + self.shape)[idx] + + # bounds check + if np.any(coords < 0) or np.any(coords >= self.shape): + raise IndexError("Index out of bounds") + else: + raise TypeError("Only integer coordinates allowed.") + # We absolutely need a contiguous array + if not coords.flags.contiguous: + coords = coords.copy() + return coords + + # Tree manipulation + def remove(self): + """Remove this node from the hierarchy. + + This method has the behavior described + in :meth:`Node._f_remove`. Please note that there is no recursive flag + since leaves do not have child nodes. + + """ + + self._f_remove(False) + + def rename(self, newname): + """Rename this node in place. + + This method has the behavior described in :meth:`Node._f_rename()`. 
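        For example, a minimal sketch (the node names are hypothetical)::

            f.root.old_name.rename('new_name')   # the leaf is now /new_name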
+ + """ + + self._f_rename(newname) + + def move(self, newparent=None, newname=None, + overwrite=False, createparents=False): + """Move or rename this node. + + This method has the behavior described in :meth:`Node._f_move` + + """ + + self._f_move(newparent, newname, overwrite, createparents) + + def copy(self, newparent=None, newname=None, + overwrite=False, createparents=False, **kwargs): + """Copy this node and return the new one. + + This method has the behavior described in :meth:`Node._f_copy`. Please + note that there is no recursive flag since leaves do not have child + nodes. + + .. warning:: + + Note that unknown parameters passed to this method will be + ignored, so may want to double check the spelling of these + (i.e. if you write them incorrectly, they will most probably + be ignored). + + Parameters + ---------- + title + The new title for the destination. If omitted or None, the original + title is used. + filters : Filters + Specifying this parameter overrides the original filter properties + in the source node. If specified, it must be an instance of the + Filters class (see :ref:`FiltersClassDescr`). The default is to + copy the filter properties from the source node. + copyuserattrs + You can prevent the user attributes from being copied by setting + this parameter to False. The default is to copy them. + start, stop, step : int + Specify the range of rows to be copied; the default is to copy all + the rows. + stats + This argument may be used to collect statistics on the copy + process. When used, it should be a dictionary with keys 'groups', + 'leaves' and 'bytes' having a numeric value. Their values will be + incremented to reflect the number of groups, leaves and bytes, + respectively, that have been copied during the operation. + chunkshape + The chunkshape of the new leaf. It supports a couple of special + values. A value of keep means that the chunkshape will be the same + than original leaf (this is the default). A value of auto means + that a new shape will be computed automatically in order to ensure + best performance when accessing the dataset through the main + dimension. Any other value should be an integer or a tuple + matching the dimensions of the leaf. + + """ + + return self._f_copy( + newparent, newname, overwrite, createparents, **kwargs) + + def truncate(self, size): + """Truncate the main dimension to be size rows. + + If the main dimension previously was larger than this size, the extra + data is lost. If the main dimension previously was shorter, it is + extended, and the extended part is filled with the default values. + + The truncation operation can only be applied to *enlargeable* datasets, + else a TypeError will be raised. + + """ + + # A non-enlargeable arrays (Array, CArray) cannot be truncated + if self.extdim < 0: + raise TypeError("non-enlargeable datasets cannot be truncated") + self._g_truncate(size) + + def isvisible(self): + """Is this node visible? + + This method has the behavior described in :meth:`Node._f_isvisible()`. + + """ + + return self._f_isvisible() + + # Attribute handling + def get_attr(self, name): + """Get a PyTables attribute from this node. + + This method has the behavior described in :meth:`Node._f_getattr`. + + """ + + return self._f_getattr(name) + + def set_attr(self, name, value): + """Set a PyTables attribute for this node. + + This method has the behavior described in :meth:`Node._f_setattr()`. 
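        For example, a minimal sketch (the attribute name is hypothetical)::

            leaf.set_attr('temperature', 23.5)
            leaf.get_attr('temperature')   # returns 23.5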
+ + """ + + self._f_setattr(name, value) + + def del_attr(self, name): + """Delete a PyTables attribute from this node. + + This method has the behavior described in :meth:`Node_f_delAttr`. + + """ + + self._f_delattr(name) + + # Data handling + def flush(self): + """Flush pending data to disk. + + Saves whatever remaining buffered data to disk. It also releases + I/O buffers, so if you are filling many datasets in the same + PyTables session, please call flush() extensively so as to help + PyTables to keep memory requirements low. + + """ + + self._g_flush() + + def _f_close(self, flush=True): + """Close this node in the tree. + + This method has the behavior described in :meth:`Node._f_close`. + Besides that, the optional argument flush tells whether to flush + pending data to disk or not before closing. + + """ + + if not self._v_isopen: + return # the node is already closed or not initialized + + # Only do a flush in case the leaf has an IO buffer. The + # internal buffers of HDF5 will be flushed afterwards during the + # self._g_close() call. Avoiding an unnecessary flush() + # operation accelerates the closing for the unbuffered leaves. + if flush and hasattr(self, "_v_iobuf"): + self.flush() + + # Close the dataset and release resources + self._g_close() + + # Close myself as a node. + super()._f_close() + + def close(self, flush=True): + """Close this node in the tree. + + This method is completely equivalent to :meth:`Leaf._f_close`. + + """ + + self._f_close(flush) diff --git a/tables/link.py b/tables/link.py new file mode 100644 index 0000000..662ecb2 --- /dev/null +++ b/tables/link.py @@ -0,0 +1,413 @@ +"""Create links in the HDF5 file. + +This module implements containers for soft and external links. Hard +links doesn't need a container as such as they are the same as regular +nodes (groups or leaves). + +Classes: + + SoftLink + ExternalLink + +Functions: + +Misc variables: + +""" + +from pathlib import Path + +import tables as tb + +from . import linkextension +from .node import Node +from .utils import lazyattr +from .attributeset import AttributeSet + + +def _g_get_link_class(parent_id, name): + """Guess the link class.""" + + return linkextension._get_link_class(parent_id, name) + + +class Link(Node): + """Abstract base class for all PyTables links. + + A link is a node that refers to another node. The Link class inherits from + Node class and the links that inherits from Link are SoftLink and + ExternalLink. There is not a HardLink subclass because hard links behave + like a regular Group or Leaf. Contrarily to other nodes, links cannot have + HDF5 attributes. This is an HDF5 library limitation that might be solved + in future releases. + + See :ref:`LinksTutorial` for a small tutorial on how to work with links. + + .. rubric:: Link attributes + + .. attribute:: target + + The path string to the pointed node. + + """ + + # Properties + @lazyattr + def _v_attrs(self): + """ + A *NoAttrs* instance replacing the typical *AttributeSet* instance of + other node objects. The purpose of *NoAttrs* is to make clear that + HDF5 attributes are not supported in link nodes. 
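        Any attempt to get or set an attribute through it raises a KeyError,
        for instance (illustrative)::

            link._v_attrs.FOO = 1   # raises KeyError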
+ """ + class NoAttrs(AttributeSet): + def __getattr__(self, name): + raise KeyError("you cannot get attributes from this " + "`%s` instance" % self.__class__.__name__) + + def __setattr__(self, name, value): + raise KeyError("you cannot set attributes to this " + "`%s` instance" % self.__class__.__name__) + + def _g_close(self): + pass + return NoAttrs(self) + + def __init__(self, parentnode, name, target=None, _log=False): + self._v_new = target is not None + self.target = target + """The path string to the pointed node.""" + + super().__init__(parentnode, name, _log) + + # Public and tailored versions for copy, move, rename and remove methods + def copy(self, newparent=None, newname=None, + overwrite=False, createparents=False): + """Copy this link and return the new one. + + See :meth:`Node._f_copy` for a complete explanation of the arguments. + Please note that there is no recursive flag since links do not have + child nodes. + + """ + + newnode = self._f_copy(newparent=newparent, newname=newname, + overwrite=overwrite, + createparents=createparents) + # Insert references to a `newnode` via `newname` + newnode._v_parent._g_refnode(newnode, newname, True) + return newnode + + def move(self, newparent=None, newname=None, overwrite=False): + """Move or rename this link. + + See :meth:`Node._f_move` for a complete explanation of the arguments. + + """ + + return self._f_move(newparent=newparent, newname=newname, + overwrite=overwrite) + + def remove(self): + """Remove this link from the hierarchy.""" + + return self._f_remove() + + def rename(self, newname=None, overwrite=False): + """Rename this link in place. + + See :meth:`Node._f_rename` for a complete explanation of the arguments. + + """ + + return self._f_rename(newname=newname, overwrite=overwrite) + + def __repr__(self): + return str(self) + + +class SoftLink(linkextension.SoftLink, Link): + """Represents a soft link (aka symbolic link). + + A soft link is a reference to another node in the *same* file hierarchy. + Provided that the target node exists, its attributes and methods can be + accessed directly from the softlink using the normal `.` syntax. + + Softlinks also have the following public methods/attributes: + + * `target` + * `dereference()` + * `copy()` + * `move()` + * `remove()` + * `rename()` + * `is_dangling()` + + Note that these will override any correspondingly named methods/attributes + of the target node. + + For backwards compatibility, it is also possible to obtain the target node + via the `__call__()` special method (this action is called *dereferencing*; + see below) + + Examples + -------- + + :: + + >>> import numpy as np + >>> f = tb.open_file('/tmp/test_softlink.h5', 'w') + >>> a = f.create_array('/', 'A', np.arange(10)) + >>> link_a = f.create_soft_link('/', 'link_A', target='/A') + + # transparent read/write access to a softlinked node + >>> link_a[0] = -1 + >>> link_a[:], link_a.dtype + (array([-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]), dtype('int64')) + + # dereferencing a softlink using the __call__() method + >>> link_a() is a + True + + # SoftLink.remove() overrides Array.remove() + >>> link_a.remove() + >>> print(link_a) + + >>> a[:], a.dtype + (array([-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]), dtype('int64')) + >>> f.close() + + + """ + + # Class identifier. 
+ _c_classid = 'SOFTLINK' + + # attributes with these names/prefixes are treated as attributes of the + # SoftLink rather than the target node + _link_attrnames = ('target', 'dereference', 'is_dangling', 'copy', 'move', + 'remove', 'rename', '__init__', '__str__', '__repr__', + '__unicode__', '__class__', '__dict__') + _link_attrprefixes = ('_f_', '_c_', '_g_', '_v_') + + def __call__(self): + """Dereference `self.target` and return the object. + + Examples + -------- + + :: + + >>> f = tb.open_file('tables/tests/slink.h5') + >>> f.root.arr2 + /arr2 (SoftLink) -> /arr + >>> print(f.root.arr2()) + /arr (Array(2,)) '' + >>> f.close() + + """ + return self.dereference() + + def dereference(self): + + if self._v_isopen: + target = self.target + # Check for relative pathnames + if not self.target.startswith('/'): + target = self._v_parent._g_join(self.target) + return self._v_file._get_node(target) + else: + return None + + def __getattribute__(self, attrname): + + # get attribute of the SoftLink itself + if (attrname in SoftLink._link_attrnames or + attrname[:3] in SoftLink._link_attrprefixes): + return object.__getattribute__(self, attrname) + + # get attribute of the target node + elif not self._v_isopen: + raise tb.ClosedNodeError('the node object is closed') + elif self.is_dangling(): + return None + else: + target_node = self.dereference() + try: + # __getattribute__() fails to get children of Groups + return target_node.__getattribute__(attrname) + except AttributeError: + # some node classes (e.g. Array) don't implement __getattr__() + return target_node.__getattr__(attrname) + + def __setattr__(self, attrname, value): + + # set attribute of the SoftLink itself + if (attrname in SoftLink._link_attrnames or + attrname[:3] in SoftLink._link_attrprefixes): + object.__setattr__(self, attrname, value) + + # set attribute of the target node + elif not self._v_isopen: + raise tb.ClosedNodeError('the node object is closed') + elif self.is_dangling(): + raise ValueError("softlink target does not exist") + else: + self.dereference().__setattr__(attrname, value) + + def __getitem__(self, key): + """__getitem__ must be defined in the SoftLink class in order for array + indexing syntax to work""" + + if not self._v_isopen: + raise tb.ClosedNodeError('the node object is closed') + elif self.is_dangling(): + raise ValueError("softlink target does not exist") + else: + return self.dereference().__getitem__(key) + + def __setitem__(self, key, value): + """__setitem__ must be defined in the SoftLink class in order for array + indexing syntax to work""" + + if not self._v_isopen: + raise tb.ClosedNodeError('the node object is closed') + elif self.is_dangling(): + raise ValueError("softlink target does not exist") + else: + self.dereference().__setitem__(key, value) + + def is_dangling(self): + return not (self.dereference() in self._v_file) + + def __str__(self): + """Return a short string representation of the link. 
+ + Examples + -------- + + :: + + >>> f = tb.open_file('tables/tests/slink.h5') + >>> f.root.arr2 + /arr2 (SoftLink) -> /arr + >>> f.close() + + """ + + target = str(self.target) + # Check for relative pathnames + if not self.target.startswith('/'): + target = self._v_parent._g_join(self.target) + closed = "" if self._v_isopen else "closed " + dangling = "" if target in self._v_file else " (dangling)" + return (f"{closed}{self._v_pathname} ({self.__class__.__name__}) -> " + f"{self.target}{dangling}") + + +class ExternalLink(linkextension.ExternalLink, Link): + """Represents an external link. + + An external link is a reference to a node in *another* file. + Getting access to the pointed node (this action is called + *dereferencing*) is done via the :meth:`__call__` special method + (see below). + + .. rubric:: ExternalLink attributes + + .. attribute:: extfile + + The external file handler, if the link has been dereferenced. + In case the link has not been dereferenced yet, its value is + None. + + """ + + # Class identifier. + _c_classid = 'EXTERNALLINK' + + def __init__(self, parentnode, name, target=None, _log=False): + self.extfile = None + """The external file handler, if the link has been dereferenced. + In case the link has not been dereferenced yet, its value is + None.""" + super().__init__(parentnode, name, target, _log) + + def _get_filename_node(self): + """Return the external filename and nodepath from `self.target`.""" + + # This is needed for avoiding the 'C:\\file.h5' filepath notation + filename, target = self.target.split(':/') + return filename, '/' + target + + def __call__(self, **kwargs): + """Dereference self.target and return the object. + + You can pass all the arguments supported by the :func:`open_file` + function (except filename, of course) so as to open the referenced + external file. + + Examples + -------- + + :: + + >>> f = tb.open_file('tables/tests/elink.h5') + >>> f.root.pep.pep2 + /pep/pep2 (ExternalLink) -> elink2.h5:/pep + >>> pep2 = f.root.pep.pep2(mode='r') # open in 'r'ead mode + >>> print(pep2) + /pep (Group) '' + >>> pep2._v_file.filename # belongs to referenced file + 'tables/tests/elink2.h5' + >>> f.close() + + """ + + filename, target = self._get_filename_node() + + if not Path(filename).is_absolute(): + # Resolve the external link with respect to the this + # file's directory. See #306. + filename = str(Path(self._v_file.filename).with_name(filename)) + + if self.extfile is None or not self.extfile.isopen: + self.extfile = tb.open_file(filename, **kwargs) + else: + # XXX: implement better consistency checks + assert self.extfile.filename == filename + assert self.extfile.mode == kwargs.get('mode', 'r') + + return self.extfile._get_node(target) + + def umount(self): + """Safely unmount self.extfile, if opened.""" + + extfile = self.extfile + # Close external file, if open + if extfile is not None and extfile.isopen: + extfile.close() + self.extfile = None + + def _f_close(self): + """Especific close for external links.""" + + self.umount() + super()._f_close() + + def __str__(self): + """Return a short string representation of the link. 
+ + Examples + -------- + + :: + + >>> f = tb.open_file('tables/tests/elink.h5') + >>> f.root.pep.pep2 + /pep/pep2 (ExternalLink) -> elink2.h5:/pep + >>> f.close() + + """ + + return (f"{self._v_pathname} ({self.__class__.__name__}) -> " + f"{self.target}") diff --git a/tables/linkextension.pyx b/tables/linkextension.pyx new file mode 100644 index 0000000..6aa8af4 --- /dev/null +++ b/tables/linkextension.pyx @@ -0,0 +1,274 @@ +######################################################################## +# +# License: BSD +# Created: November 25, 2009 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +"""Cython functions and classes for supporting links in HDF5.""" + +from .exceptions import HDF5ExtError + +from .hdf5extension cimport Node +from .utilsextension cimport cstr_to_pystr + +from libc.stdlib cimport malloc, free +from libc.string cimport strlen +from cpython.unicode cimport PyUnicode_DecodeUTF8 +from .definitions cimport (H5P_DEFAULT, + hid_t, herr_t, hbool_t, int64_t, H5T_cset_t, haddr_t) + + + +#---------------------------------------------------------------------- + +# External declarations + +cdef extern from "H5Lpublic.h" nogil: + + ctypedef enum H5L_type_t: + H5L_TYPE_ERROR = (-1), # Invalid link type id + H5L_TYPE_HARD = 0, # Hard link id + H5L_TYPE_SOFT = 1, # Soft link id + H5L_TYPE_EXTERNAL = 64, # External link id + H5L_TYPE_MAX = 255 # Maximum link type id + + # Information struct for link (for H5Lget_info) + cdef union _add_u: + haddr_t address # Address hard link points to + size_t val_size # Size of a soft link or UD link value + + ctypedef struct H5L_info_t: + H5L_type_t type # Type of link + hbool_t corder_valid # Indicate if creation order is valid + int64_t corder # Creation order + H5T_cset_t cset # Character set of link name + _add_u u # Size of a soft link or UD link value + + # Operations with links + herr_t H5Lcreate_hard( + hid_t obj_loc_id, char *obj_name, hid_t link_loc_id, char *link_name, + hid_t lcpl_id, hid_t lapl_id) + + herr_t H5Lcreate_soft( + char *target_path, hid_t link_loc_id, char *link_name, + hid_t lcpl_id, hid_t lapl_id) + + herr_t H5Lcreate_external( + char *file_name, char *object_name, hid_t link_loc_id, char *link_name, + hid_t lcpl_id, hid_t lapl_id) + + herr_t H5Lget_info( + hid_t link_loc_id, char *link_name, H5L_info_t *link_buff, + hid_t lapl_id) + + herr_t H5Lget_val( + hid_t link_loc_id, char *link_name, void *linkval_buff, size_t size, + hid_t lapl_id) + + herr_t H5Lunpack_elink_val( + char *ext_linkval, size_t link_size, unsigned *flags, + const char **filename, const char **obj_path) + + herr_t H5Lcopy( + hid_t src_loc_id, char *src_name, hid_t dest_loc_id, char *dest_name, + hid_t lcpl_id, hid_t lapl_id) + + +#---------------------------------------------------------------------- + +# Helper functions + +def _get_link_class(parent_id, name): + """Guess the link class.""" + + cdef herr_t ret + cdef H5L_info_t link_buff + cdef H5L_type_t link_type + + ret = H5Lget_info(parent_id, name, &link_buff, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to get info about link") + + link_type = link_buff.type + if link_type == H5L_TYPE_SOFT: + return "SoftLink" + elif link_type == H5L_TYPE_EXTERNAL: + return "ExternalLink" + # elif link_type == H5L_TYPE_HARD: + # return "HardLink" + else: + return "UnImplemented" + + + + +def _g_create_hard_link(parentnode, str name, targetnode): + """Create a hard link in the file.""" + + cdef herr_t ret + cdef 
bytes encoded_name = name.encode('utf-8') + cdef bytes encoded_v_name = targetnode._v_name.encode('utf-8') + + ret = H5Lcreate_hard(targetnode._v_parent._v_objectid, encoded_v_name, + parentnode._v_objectid, encoded_name, + H5P_DEFAULT, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to create HDF5 hard link") + + + + +#---------------------------------------------------------------------- + +# Public classes + +cdef class Link(Node): + """Extension class from which all link extensions inherits.""" + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + """Private part for the _f_copy() method.""" + + cdef herr_t ret + cdef object stats + cdef bytes encoded_name, encoded_newname + + encoded_name = self.name.encode('utf-8') + encoded_newname = newname.encode('utf-8') + + # @TODO: set property list --> utf-8 + ret = H5Lcopy(self.parent_id, encoded_name, newparent._v_objectid, + encoded_newname, H5P_DEFAULT, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to copy HDF5 link") + + # Update statistics if needed. + stats = kwargs.get('stats', None) + if stats is not None: + stats['links'] += 1 + + return newparent._v_file.get_node(newparent, newname) + + +cdef class SoftLink(Link): + """Extension class representing a soft link.""" + + def _g_create(self): + """Create the link in file.""" + + cdef herr_t ret + cdef bytes encoded_name = self.name.encode('utf-8') + cdef bytes encoded_target = self.target.encode('utf-8') + + ret = H5Lcreate_soft(encoded_target, self.parent_id, encoded_name, + H5P_DEFAULT, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to create HDF5 soft link") + + return 0 # Object ID is zero'ed, as HDF5 does not assign one for links + + def _g_open(self): + """Open the link in file.""" + + cdef herr_t ret + cdef H5L_info_t link_buff + cdef size_t val_size + cdef char *clinkval + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + ret = H5Lget_info(self.parent_id, encoded_name, &link_buff, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to get info about soft link") + + val_size = link_buff.u.val_size + clinkval = malloc(val_size) + + ret = H5Lget_val(self.parent_id, encoded_name, clinkval, val_size, + H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to get target value") + + self.target = PyUnicode_DecodeUTF8(clinkval, strlen(clinkval), NULL) + + # Release resources + free(clinkval) + return 0 # Object ID is zero'ed, as HDF5 does not assign one for links + + +cdef class ExternalLink(Link): + """Extension class representing an external link.""" + + def _g_create(self): + """Create the link in file.""" + + cdef herr_t ret + cdef bytes encoded_name, encoded_filename, encoded_target + + encoded_name = self.name.encode('utf-8') + + filename, target = self._get_filename_node() + encoded_filename = filename.encode('utf-8') + encoded_target = target.encode('utf-8') + + ret = H5Lcreate_external(encoded_filename, encoded_target, + self.parent_id, encoded_name, + H5P_DEFAULT, H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to create HDF5 external link") + + return 0 # Object ID is zero'ed, as HDF5 does not assign one for links + + def _g_open(self): + """Open the link in file.""" + + cdef herr_t ret + cdef H5L_info_t link_buff + cdef size_t val_size + cdef char *clinkval + cdef char *cfilename + cdef char *c_obj_path + cdef unsigned flags + cdef bytes encoded_name + cdef str filename, obj_path + + encoded_name = self.name.encode('utf-8') + + ret = H5Lget_info(self.parent_id, encoded_name, &link_buff, 
H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to get info about external link") + + val_size = link_buff.u.val_size + clinkval = malloc(val_size) + + ret = H5Lget_val(self.parent_id, encoded_name, clinkval, val_size, + H5P_DEFAULT) + if ret < 0: + raise HDF5ExtError("failed to get target value") + + ret = H5Lunpack_elink_val(clinkval, val_size, &flags, + &cfilename, + &c_obj_path) + if ret < 0: + raise HDF5ExtError("failed to unpack external link value") + + filename = cstr_to_pystr(cfilename) + obj_path = cstr_to_pystr(c_obj_path) + + self.target = filename+':'+obj_path + + # Release resources + free(clinkval) + return 0 # Object ID is zero'ed, as HDF5 does not assign one for links + + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/lrucacheextension.pxd b/tables/lrucacheextension.pxd new file mode 100644 index 0000000..45edd60 --- /dev/null +++ b/tables/lrucacheextension.pxd @@ -0,0 +1,82 @@ +######################################################################## +# +# License: BSD +# Created: +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +from numpy cimport ndarray + +# Declaration of instance variables for shared classes +# The NodeCache class is useful for caching general objects (like Nodes). +cdef class NodeCache: + cdef readonly long nslots + cdef long nextslot + cdef object nodes, paths + cdef object setitem(self, object path, object node) + cdef long getslot(self, object path) + cdef object cpop(self, object path) + + +# Base class for other caches +cdef class BaseCache: + cdef int iscachedisabled, incsetcount + cdef long setcount, getcount, containscount + cdef long disablecyclecount, disableeverycycles + cdef long enablecyclecount, enableeverycycles + cdef double nprobes, hitratio + cdef long seqn_, nextslot, nslots + cdef long *ratimes + cdef double lowesthr + cdef ndarray atimes + cdef object name + cdef int checkhitratio(self) + cdef int couldenablecache_(self) + cdef long incseqn(self) + + +# Helper class for ObjectCache +cdef class ObjectNode: + cdef object key, obj + cdef long nslot + + +# The ObjectCache class is useful for general python objects +cdef class ObjectCache(BaseCache): + cdef long maxcachesize, cachesize, maxobjsize + cdef long *rsizes + cdef ndarray sizes + cdef object __list, __dict + cdef ObjectNode mrunode + cdef removeslot_(self, long nslot) + cdef clearcache_(self) + cdef updateslot_(self, long nslot, long size, object key, object value) + cdef long setitem_(self, object key, object value, long size) + cdef long getslot_(self, object key) + cdef object getitem_(self, long nslot) + + +# The NumCache class is useful for caching numerical data in an efficient way +cdef class NumCache(BaseCache): + cdef long itemsize, slotsize + cdef ndarray cacheobj, keys + cdef void *rcache + cdef long long *rkeys + cdef object __dict + cdef void *getaddrslot_(self, long nslot) + cdef long setitem_(self, long long key, void *data, long start) + cdef long setitem1_(self, long long key) + cdef long getslot_(self, long long key) + cdef getitem_(self, long nslot, void *data, long start) + cdef void *getitem1_(self, long nslot) + + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/lrucacheextension.pyx b/tables/lrucacheextension.pyx new file mode 100644 index 0000000..6f58c52 --- /dev/null +++ 
b/tables/lrucacheextension.pyx @@ -0,0 +1,639 @@ +######################################################################## +# +# License: BSD +# Created: Aug 13, 2006 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id: $ +# +######################################################################## + +"""Cython interface for several LRU cache systems. + +Classes (type extensions): + + NodeCache + ObjectCache + NumCache + +Functions: + +Misc variables: + +""" + +cdef extern from "Python.h": + int PyUnicode_Compare(object, object) + +import sys + +import numpy +from libc.string cimport memcpy, strcmp +from cpython.unicode cimport PyUnicode_Check +from numpy cimport import_array, ndarray, PyArray_DATA + +from .parameters import (DISABLE_EVERY_CYCLES, ENABLE_EVERY_CYCLES, + LOWEST_HIT_RATIO) + + + +#---------------------------------------------------------------------------- +# Initialization code. +# The numpy API requires this function to be called before +# using any numpy facilities in an extension module. +import_array() +#---------------------------------------------------------------------------- + + +# ------- Minimalist NodeCache for nodes in PyTables --------- + +# The next NodeCache code relies on the fact that a node that is +# fetched from the cache will be removed from it. Said in other words: +# "A node cannot be alive and dead at the same time." + +# Thanks to the above behaviour, the next code has been stripped down +# to a bare minimum (the info in cache is kept in just 2 lists). + +#*********************** Important note! ***************************** +# The code behind has been carefully tuned to serve the needs of +# PyTables cache for nodes. As a consequence, it is no longer +# appropriate as a general LRU cache implementation. You have been +# warned!. F. Alted 2006-08-08 +#********************************************************************* + + +cdef class NodeCache: + """Least-Recently-Used (LRU) cache for PyTables nodes.""" + + def __init__(self, nslots): + """Maximum nslots of the cache. + + If more than 'nslots' elements are added to the cache, + the least-recently-used ones will be discarded. + + """ + + if nslots < 0: + raise ValueError("Negative number (%s) of slots!" % nslots) + self.nslots = nslots + self.nextslot = 0 + self.nodes = [] + self.paths = [] + + def __len__(self): + return len(self.nodes) + + def __setitem__(self, path, node): + self.setitem(path, node) + + cdef setitem(self, object path, object node): + """Puts a new node in the node list.""" + + if self.nslots == 0: # Oops, the cache is set to empty + return + # Check if we are growing out of space + if self.nextslot == self.nslots: + # It is critical to reduce nextslot *before* the preemption of + # the LRU node. If not, this can lead with problems in situations + # with very small caches (length 1 or so). + # F. Alted 2008-10-22 + self.nextslot = self.nextslot - 1 + # Remove the LRU node and path (the start of the lists) + del self.nodes[0] + del self.paths[0] + # The equality protection has been put for situations in which a + # node is being preempted and added simultaneously (with very small + # caches). 
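    # Editorial sketch (comments only): the intended usage from the Python
    # side is simply
    #     cache = NodeCache(64)            # keep at most 64 nodes
    #     cache['/group/leaf'] = node      # may evict the least recently used entry
    #     node = cache.pop('/group/leaf')  # fetching a node also removes it
    # which is why a node is never both in the cache and alive outside it.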
+ if len(self.nodes) == len(self.paths): + # Add the node and path to the end of its lists + self.nodes.append(node) + self.paths.append(path) + self.nextslot = self.nextslot + 1 + + def __contains__(self, path): + if self.getslot(path) == -1: + return 0 + else: + return 1 + + cdef long getslot(self, object path): + """Checks whether path is in this cache or not.""" + + cdef long i, nslot, compare + + nslot = -1 # -1 means not found + if PyUnicode_Check(path): + # Start looking from the trailing values (most recently used) + for i from self.nextslot > i >= 0: + #if strcmp(encoded_path, self.paths[i]) == 0: + if PyUnicode_Compare(path, self.paths[i]) == 0: + nslot = i + break + else: + # Start looking from the trailing values (most recently used) + for i from self.nextslot > i >= 0: + #if strcmp(path, self.paths[i]) == 0: + if PyUnicode_Check(self.paths[i]): + compare = PyUnicode_Compare(path, self.paths[i]) + else: + compare = strcmp(path, self.paths[i]) + if compare == 0: + nslot = i + break + + return nslot + + __marker = object() + + def pop(self, path, d=__marker): + try: + node = self.cpop(path) + except KeyError: + if d is not self.__marker: + return d + else: + raise + else: + return node + + cdef object cpop(self, object path): + cdef long nslot + + nslot = self.getslot(path) + if nslot == -1: + raise KeyError(path) + else: + node = self.nodes[nslot] + del self.nodes[nslot] + del self.paths[nslot] + self.nextslot = self.nextslot - 1 + return node + + def __iter__(self): + # Do a copy of the paths list because it can be modified in the middle of + # the iterator! + copy = self.paths[:] + return iter(copy) + + def __repr__(self): + return "<%s (%d elements)>" % (str(self.__class__), len(self.paths)) + + +######################################################################## +# Common code for other LRU cache classes +######################################################################## + +cdef class BaseCache: + """Base class that implements automatic probing/disabling of the cache.""" + + def __init__(self, long nslots, object name): + + if nslots < 0: + raise ValueError("Negative number (%s) of slots!" % nslots) + self.setcount = 0; self.getcount = 0; self.containscount = 0 + self.enablecyclecount = 0; self.disablecyclecount = 0 + self.iscachedisabled = False # Cache is enabled by default + self.disableeverycycles = DISABLE_EVERY_CYCLES + self.enableeverycycles = ENABLE_EVERY_CYCLES + self.lowesthr = LOWEST_HIT_RATIO + self.nprobes = 0.0; self.hitratio = 0.0 + self.nslots = nslots + self.seqn_ = 0; self.nextslot = 0 + self.name = name + self.incsetcount = False + # The array for keeping the access times (using long ints here) + self.atimes = numpy.zeros(shape=nslots, dtype=numpy.int_) + self.ratimes = PyArray_DATA(self.atimes) + + def __len__(self): + return self.nslots + + # Machinery for determining whether the hit ratio is being effective + # or not. If not, the cache will be disabled. The efficency will be + # checked every cycle (the time that the cache would be refilled + # completely). In situations where the cache is not being re-filled + # (i.e. it is not enabled) for a long time, it is forced to be + # re-enabled when a certain number of cycles has passed so as to + # check whether a new scenario where the cache can be useful again + # has come. + # F. 
Alted 2006-08-09 + cdef int checkhitratio(self): + cdef double hitratio + cdef long nslot + + if self.setcount > self.nslots: + self.disablecyclecount = self.disablecyclecount + 1 + self.enablecyclecount = self.enablecyclecount + 1 + self.nprobes = self.nprobes + 1 + hitratio = self.getcount / self.containscount + self.hitratio = self.hitratio + hitratio + # Reset the hit counters + self.setcount = 0; self.getcount = 0; self.containscount = 0 + if (not self.iscachedisabled and + self.disablecyclecount >= self.disableeverycycles): + # Check whether the cache is being effective or not + if hitratio < self.lowesthr: + # Hit ratio is low. Disable the cache. + self.iscachedisabled = True + else: + # Hit ratio is acceptable. (Re-)Enable the cache. + self.iscachedisabled = False + self.disablecyclecount = 0 + if self.enablecyclecount >= self.enableeverycycles: + # We have reached the time for forcing the cache to act again + self.iscachedisabled = False + self.enablecyclecount = 0 + return not self.iscachedisabled + + def couldenablecache(self): + return self.couldenablecache_() + + # Check whether the cache is enabled or *could* be enabled in the next + # setitem operation. This method can be used in order to probe whether + # an (expensive) operation to be done before a .setitem() is worth the + # effort or not. + cdef int couldenablecache_(self): + + if self.nslots == 0: + return False + # Increment setitem because it can be that .setitem() doesn't + # get called after calling this. + self.setcount = self.setcount + 1; self.incsetcount = True + if self.iscachedisabled: + if self.setcount == self.nslots: + # The cache *could* be enabled in the next setitem operation + return True + else: + return False + else: + return True + + # Increase the access time (implemented as a C long sequence) + cdef long incseqn(self): + + self.seqn_ = self.seqn_ + 1 + if self.seqn_ < 0: + # Ooops, the counter has run out of range! Reset all the access times. + self.atimes[:] = sys.maxint + # Set the counter to 1 (to indicate that it is newer than existing ones) + self.seqn_ = 1 + return self.seqn_ + + def __repr__(self): + return "<%s(%s) (%d elements)>" % (self.name, str(self.__class__), + self.nslots) + + +######################################################################## +# Helper class for ObjectCache +######################################################################## + +cdef class ObjectNode: + """Record of a cached value. Not for public consumption.""" + + def __init__(self, object key, object obj, long nslot): + object.__init__(self) + self.key = key + self.obj = obj + self.nslot = nslot + + def __repr__(self): + return "<%s %s (slot #%s) => %s>" % (self.__class__, self.key, self.nslot, + self.object) + + +######################################################################## +# Minimalistic LRU cache implementation for general python objects +# This is a *true* general lru cache for python objects +######################################################################## + +cdef class ObjectCache(BaseCache): + """Least-Recently-Used (LRU) cache specific for python objects.""" + + def __init__(self, long nslots, long maxcachesize, object name): + """Maximum size of the cache. + + If more than 'nslots' elements are added to the cache, + the least-recently-used ones will be discarded. 
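    A typical access pattern from the caller's side looks like this (an
    illustrative sketch; ``compute_value`` and ``estimate_size`` are
    hypothetical helpers)::

        nslot = cache.getslot(key)
        if nslot >= 0:
            value = cache.getitem(nslot)    # cache hit
        else:
            value = compute_value(key)
            cache.setitem(key, value, estimate_size(value))  # may be rejected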
+ + Parameters: + nslots - The number of slots in cache + name - A descriptive name for this cache + + """ + + super().__init__(nslots, name) + self.cachesize = 0 + self.maxcachesize = maxcachesize + # maxobjsize will be the same as the maximum cache size + self.maxobjsize = maxcachesize + self.__list = [None]*nslots + self.__dict = {} + self.mrunode = None # Most Recent Used node + # The array for keeping the object size (using long ints here) + self.sizes = numpy.zeros(shape=nslots, dtype=numpy.int_) + self.rsizes = PyArray_DATA(self.sizes) + + # Clear cache + cdef clearcache_(self): + self.__list = [None]*self.nslots + self.__dict = {} + self.mrunode = None + self.cachesize = 0 + self.nextslot = 0 + self.seqn_ = 0 + + # Remove a slot (if it exists in cache) + cdef removeslot_(self, long nslot): + cdef ObjectNode node + + assert nslot < self.nslots, "Attempting to remove beyond cache capacity." + node = self.__list[nslot] + if node is not None: + self.__list[nslot] = None + del self.__dict[node.key] + self.cachesize = self.cachesize - self.rsizes[nslot] + self.rsizes[nslot] = 0 + if self.mrunode and self.mrunode.nslot == nslot: + self.mrunode = None + # The next slot to be updated will be this one + self.nextslot = nslot + + # Update a slot + cdef updateslot_(self, long nslot, long size, object key, object value): + cdef ObjectNode node, oldnode + cdef long nslot1, nslot2 + cdef object lruidx + + assert nslot < self.nslots, "Number of nodes exceeding cache capacity." + # Remove the previous nslot + self.removeslot_(nslot) + # Protection against too large data cache size + while size + self.cachesize > self.maxcachesize: + # Remove the LRU node among the 10 largest ones + largidx = self.sizes.argsort()[-10:] + nslot1 = self.atimes[largidx].argmin() + nslot2 = largidx[nslot1] + self.removeslot_(nslot2) + # Insert the new one + node = ObjectNode(key, value, nslot) + self.ratimes[nslot] = self.incseqn() + self.rsizes[nslot] = size + self.__list[nslot] = node + self.__dict[key] = node + self.mrunode = node + self.cachesize = self.cachesize + size + # The next slot to update will be the LRU + self.nextslot = self.atimes.argmin() + + # Put the object to the data in cache (for Python calls) + def setitem(self, object key, object value, object size): + return self.setitem_(key, value, size) + + # Put the object in cache (for cython calls) + # size can be the exact size of the value object or an estimation. + cdef long setitem_(self, object key, object value, long size): + cdef long nslot + + if self.nslots == 0: # The cache has been set to empty + return -1 + nslot = -1 + # Perhaps setcount has been already incremented in couldenablecache() + if not self.incsetcount: + self.setcount = self.setcount + 1 + else: + self.incsetcount = False + if size > self.maxobjsize: # Check if the object is too large + return -1 + if self.checkhitratio(): + nslot = self.nextslot + self.updateslot_(nslot, size, key, value) + else: + # Empty the cache because it is not effective and it is taking space + self.clearcache_() + return nslot + + # Tells whether the key is in cache or not + def __contains__(self, object key): + return self.__dict.has_key(key) + + # Tells in which slot the key is. If not found, -1 is returned. + def getslot(self, object key): + return self.getslot_(key) + + # Tells in which slot the key is. If not found, -1 is returned. 
+ cdef long getslot_(self, object key): + cdef ObjectNode node + + if self.nslots == 0: # The cache has been set to empty + return -1 + self.containscount = self.containscount + 1 + # Give a chance to the MRU node + node = self.mrunode + if node and node.key == key: + return node.nslot + # No luck. Look in the dictionary. + node = self.__dict.get(key) + if node is None: + return -1 + return node.nslot + + # Return the object to the data in cache (for Python calls) + def getitem(self, object nslot): + return self.getitem_(nslot) + + # Return the object to the data in cache (for cython calls) + cdef object getitem_(self, long nslot): + cdef ObjectNode node + + self.getcount = self.getcount + 1 + node = self.__list[nslot] + self.ratimes[nslot] = self.incseqn() + self.mrunode = node + return node.obj + + def __repr__(self): + if self.nprobes > 0: + hitratio = self.hitratio / self.nprobes + else: + hitratio = self.getcount / self.containscount + return """<%s(%s) + (%d maxslots, %d slots used, %.3f KB cachesize, + hit ratio: %.3f, disabled? %s)> + """ % (self.name, str(self.__class__), self.nslots, self.nextslot, + self.cachesize / 1024., hitratio, self.iscachedisabled) + + +################################################################### +# Minimalistic LRU cache implementation for numerical data +################################################################### +# The next code is more efficient in situations where efficiency is low. +################################################################### + +#*********************** Important note! **************************** +# The code behind has been carefully tuned to serve the needs of +# caching numerical data. As a consequence, it is no longer appropriate +# as a general LRU cache implementation. You have been warned!. +# F. Alted 2006-08-09 +#******************************************************************** + +cdef class NumCache(BaseCache): + """Least-Recently-Used (LRU) cache specific for Numerical data.""" + + def __init__(self, object shape, object dtype, object name): + """Maximum size of the cache. + + If more than 'nslots' elements are added to the cache, + the least-recently-used ones will be discarded. + + Parameters: + shape - The rectangular shape of the cache (nslots, nelemsperslot) + itemsize - The size of the element base in cache + name - A descriptive name for this cache + + """ + + cdef long nslots + + nslots = shape[0]; self.slotsize = shape[1] + if nslots >= 1<<16: + # nslots can't be higher than 2**16. Will silently trunk the number. 
+ nslots = ((1<<16)-1) # Cast makes cython happy here + super().__init__(nslots, name) + self.itemsize = dtype.itemsize + self.__dict = {} + # The cache object where all data will go + # The last slot is to allow the setitem1_ method to still return + # a valid scratch area for writing purposes + self.cacheobj = numpy.empty(shape=(nslots+1, self.slotsize), + dtype=dtype) + self.rcache = PyArray_DATA(self.cacheobj) + # The array for keeping the keys of slots + self.keys = (-numpy.ones(shape=nslots, dtype=numpy.int64)) + self.rkeys = PyArray_DATA(self.keys) + + # Returns the address of nslot + cdef void *getaddrslot_(self, long nslot): + if nslot >= 0: + return self.rcache + nslot * self.slotsize * self.itemsize + else: + return self.rcache + self.nslots * self.slotsize * self.itemsize + + def setitem(self, long long key, ndarray nparr, long start): + return self.setitem_(key, PyArray_DATA(nparr), start) + + # Copy the new data into a cache slot + cdef long setitem_(self, long long key, void *data, long start): + cdef long nslot + + nslot = self.setitem1_(key) + if nslot >= 0: + # Copy the data to cache + memcpy(self.rcache + nslot * self.slotsize * self.itemsize, + data + start * self.itemsize, + self.slotsize * self.itemsize) + return nslot + + # Return a cache data pointer appropriate to save data. + # Even if the cache is disabled, this will return a -1, which is + # the last element in the cache. + # This version avoids a memcpy of data, but the user should be + # aware that data in nslot cannot be overwritten! + cdef long setitem1_(self, long long key): + cdef long nslot + cdef object key2 + + if self.nslots == 0: # Oops, the cache is set to empty + return -1 + # Perhaps setcount has been already incremented in couldenablecache() + if not self.incsetcount: + self.setcount = self.setcount + 1 + else: + self.incsetcount = False + nslot = -1 + if self.checkhitratio(): + # Check if we are growing out of space + if self.nextslot == self.nslots: + # Get the least recently used slot + nslot = self.atimes.argmin() + # Remove the slot from the dict + key2 = self.keys[nslot] + del self.__dict[key2] + self.nextslot = self.nextslot - 1 + else: + # Get the next slot available + nslot = self.nextslot + # Insert the slot in the dictionary + self.__dict[key] = nslot + self.keys[nslot] = key + self.ratimes[nslot] = self.incseqn() + self.nextslot = self.nextslot + 1 + # The next reduces the performance of the cache in scenarios where + # the efficicency is near to zero. I don't understand exactly why. + # F. Alted 24-03-2008 + elif self.nextslot > 0: + # Empty the cache if needed + self.__dict.clear() + self.nextslot = 0 + return nslot + + def getslot(self, long long key): + return self.getslot_(key) + + # Tells in which slot key is. If not found, -1 is returned. + cdef long getslot_(self, long long key): + cdef object nslot + + self.containscount = self.containscount + 1 + if self.nextslot == 0: # No chances for finding a slot + return -1 + try: + nslot = self.__dict[key] + except KeyError: + return -1 + return nslot + + def getitem(self, long nslot, ndarray nparr, long start): + self.getitem_(nslot, PyArray_DATA(nparr), start) + + # This version copies data in cache to data+start. + # The user should be responsible to provide a large enough data buffer + # to keep all the data. 
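  # Editorial sketch (comments only): a round trip through NumCache from
  # Python, assuming a cache built as NumCache((128, 256), numpy.dtype('f8'), 'data'):
  #     nslot = cache.setitem(key, chunk, 0)   # copy 256 float64 items from chunk
  #     nslot = cache.getslot(key)             # -1 on a miss
  #     cache.getitem(nslot, out, 0)           # copy the slot back into out
  # where ``chunk`` and ``out`` are 1-d float64 ndarrays with at least 256 items.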
+ cdef getitem_(self, long nslot, void *data, long start): + cdef void *cachedata + + cachedata = self.getitem1_(nslot) + # Copy the data in cache to destination + memcpy(data + start * self.itemsize, cachedata, + self.slotsize * self.itemsize) + + # Return the pointer to the data in cache + # This version avoids a memcpy of data, but the user should be + # aware that data in nslot cannot be overwritten! + cdef void *getitem1_(self, long nslot): + + self.getcount = self.getcount + 1 + self.ratimes[nslot] = self.incseqn() + return self.rcache + nslot * self.slotsize * self.itemsize + + def __repr__(self): + cachesize = (self.nslots * self.slotsize * self.itemsize) / 1024. + if self.nprobes > 0: + hitratio = self.hitratio / self.nprobes + elif self.containscount > 0: + hitratio = self.getcount / self.containscount + else: + hitratio = numpy.nan + return """<%s(%s) + (%d maxslots, %d slots used, %.3f KB cachesize, + hit ratio: %.3f, disabled? %s)> + """ % (self.name, str(self.__class__), self.nslots, self.nextslot, + cachesize, hitratio, self.iscachedisabled) + + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/misc/__init__.py b/tables/misc/__init__.py new file mode 100644 index 0000000..efbacf9 --- /dev/null +++ b/tables/misc/__init__.py @@ -0,0 +1,6 @@ +"""Miscellaneous general-purpose modules + +The purpose, authorship and license of modules in this package is +diverse, and they may be useful outside of PyTables. Please read +their source code for further information. +""" diff --git a/tables/misc/enum.py b/tables/misc/enum.py new file mode 100644 index 0000000..39c2cb8 --- /dev/null +++ b/tables/misc/enum.py @@ -0,0 +1,436 @@ +"""Implementation of enumerated types. + +This module provides the `Enum` class, which can be used to construct +enumerated types. Those types are defined by providing an *exhaustive +set or list* of possible, named values for a variable of that type. +Enumerated variables of the same type are usually compared between them +for equality and sometimes for order, but are not usually operated upon. + +Enumerated values have an associated *name* and *concrete value*. Every +name is unique and so are concrete values. An enumerated variable +always takes the concrete value, not its name. Usually, the concrete +value is not used directly, and frequently it is entirely irrelevant. +For the same reason, an enumerated variable is not usually compared with +concrete values out of its enumerated type. For that kind of use, +standard variables and constants are more adequate. + +""" + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +class Enum: + """Enumerated type. + + Each instance of this class represents an enumerated type. The + values of the type must be declared + *exhaustively* and named with + *strings*, and they might be given explicit + concrete values, though this is not compulsory. Once the type is + defined, it can not be modified. + + There are three ways of defining an enumerated type. Each one + of them corresponds to the type of the only argument in the + constructor of Enum: + + - *Sequence of names*: each enumerated + value is named using a string, and its order is determined by + its position in the sequence; the concrete value is assigned + automatically:: + + >>> boolEnum = Enum(['True', 'False']) + + - *Mapping of names*: each enumerated + value is named by a string and given an explicit concrete value. 
+ All of the concrete values must be different, or a + ValueError will be raised:: + + >>> priority = Enum({'red': 20, 'orange': 10, 'green': 0}) + >>> colors = Enum({'red': 1, 'blue': 1}) + Traceback (most recent call last): + ... + ValueError: enumerated values contain duplicate concrete values: 1 + + - *Enumerated type*: in that case, a copy + of the original enumerated type is created. Both enumerated + types are considered equal:: + + >>> prio2 = Enum(priority) + >>> priority == prio2 + True + + Please note that names starting with _ are + not allowed, since they are reserved for internal usage:: + + >>> prio2 = Enum(['_xx']) + Traceback (most recent call last): + ... + ValueError: name of enumerated value can not start with ``_``: '_xx' + + The concrete value of an enumerated value is obtained by + getting its name as an attribute of the Enum + instance (see __getattr__()) or as an item (see + __getitem__()). This allows comparisons between + enumerated values and assigning them to ordinary Python + variables:: + + >>> redv = priority.red + >>> redv == priority['red'] + True + >>> redv > priority.green + True + >>> priority.red == priority.orange + False + + The name of the enumerated value corresponding to a concrete + value can also be obtained by using the + __call__() method of the enumerated type. In this + way you get the symbolic name to use it later with + __getitem__():: + + >>> priority(redv) + 'red' + >>> priority.red == priority[priority(priority.red)] + True + + (If you ask, the __getitem__() method is + not used for this purpose to avoid ambiguity in the case of using + strings as concrete values.) + + """ + + def __init__(self, enum): + mydict = self.__dict__ + + mydict['_names'] = {} + mydict['_values'] = {} + + if isinstance(enum, list) or isinstance(enum, tuple): + for (value, name) in enumerate(enum): # values become 0, 1, 2... + self._check_and_set_pair(name, value) + elif isinstance(enum, dict): + for (name, value) in enum.items(): + self._check_and_set_pair(name, value) + elif isinstance(enum, Enum): + for (name, value) in enum._names.items(): + self._check_and_set_pair(name, value) + else: + raise TypeError("""\ +enumerations can only be created from \ +sequences, mappings and other enumerations""") + + def _check_and_set_pair(self, name, value): + """Check validity of enumerated value and insert it into type.""" + + names = self._names + values = self._values + + if not isinstance(name, str): + raise TypeError( + f"name of enumerated value is not a string: {name!r}") + if name.startswith('_'): + raise ValueError( + "name of enumerated value can not start with ``_``: %r" + % name) + # This check is only necessary with a sequence base object. + if name in names: + raise ValueError( + "enumerated values contain duplicate names: %r" % name) + # This check is only necessary with a mapping base object. + if value in values: + raise ValueError( + "enumerated values contain duplicate concrete values: %r" + % value) + + names[name] = value + values[value] = name + self.__dict__[name] = value + + def __getitem__(self, name): + """Get the concrete value of the enumerated value with that name. + + The name of the enumerated value must be a string. If there is no value + with that name in the enumeration, a KeyError is raised. + + Examples + -------- + + Let ``enum`` be an enumerated type defined as: + + >>> enum = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + + then: + + >>> enum['T1'] + 2 + >>> enum['foo'] + Traceback (most recent call last): + ... 
+ KeyError: "no enumerated value with that name: 'foo'" + + """ + + try: + return self._names[name] + except KeyError: + raise KeyError(f"no enumerated value with that name: {name!r}") + + def __setitem__(self, name, value): + """This operation is forbidden.""" + raise IndexError("operation not allowed") + + def __delitem__(self, name): + """This operation is forbidden.""" + raise IndexError("operation not allowed") + + def __getattr__(self, name): + """Get the concrete value of the enumerated value with that name. + + The name of the enumerated value must be a string. If there is no value + with that name in the enumeration, an AttributeError is raised. + + Examples + -------- + Let ``enum`` be an enumerated type defined as: + + >>> enum = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + + then: + + >>> enum.T1 + 2 + >>> enum.foo + Traceback (most recent call last): + ... + AttributeError: no enumerated value with that name: 'foo' + + """ + + try: + return self[name] + except KeyError as ke: + raise AttributeError(*ke.args) + + def __setattr__(self, name, value): + """This operation is forbidden.""" + raise AttributeError("operation not allowed") + + def __delattr__(self, name): + """This operation is forbidden.""" + raise AttributeError("operation not allowed") + + def __contains__(self, name): + """Is there an enumerated value with that name in the type? + + If the enumerated type has an enumerated value with that name, True is + returned. Otherwise, False is returned. The name must be a string. + + This method does *not* check for concrete values matching a value in an + enumerated type. For that, please use the :meth:`Enum.__call__` method. + + Examples + -------- + Let ``enum`` be an enumerated type defined as: + + >>> enum = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + + then: + + >>> 'T1' in enum + True + >>> 'foo' in enum + False + >>> 0 in enum + Traceback (most recent call last): + ... + TypeError: name of enumerated value is not a string: 0 + >>> enum.T1 in enum # Be careful with this! + Traceback (most recent call last): + ... + TypeError: name of enumerated value is not a string: 2 + + """ + + if not isinstance(name, str): + raise TypeError( + f"name of enumerated value is not a string: {name!r}") + return name in self._names + + def __call__(self, value, *default): + """Get the name of the enumerated value with that concrete value. + + If there is no value with that concrete value in the enumeration and a + second argument is given as a default, this is returned. Else, a + ValueError is raised. + + This method can be used for checking that a concrete value belongs to + the set of concrete values in an enumerated type. + + Examples + -------- + Let ``enum`` be an enumerated type defined as: + + >>> enum = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + + then: + + >>> enum(5) + 'T2' + >>> enum(42, None) is None + True + >>> enum(42) + Traceback (most recent call last): + ... + ValueError: no enumerated value with that concrete value: 42 + + """ + + try: + return self._values[value] + except KeyError: + if len(default) > 0: + return default[0] + raise ValueError( + f"no enumerated value with that concrete value: {value!r}") + + def __len__(self): + """Return the number of enumerated values in the enumerated type. + + Examples + -------- + >>> len(Enum(['e%d' % i for i in range(10)])) + 10 + + """ + + return len(self._names) + + def __iter__(self): + """Iterate over the enumerated values. + + Enumerated values are returned as (name, value) pairs *in no particular + order*. 
+ + Examples + -------- + >>> enumvals = {'red': 4, 'green': 2, 'blue': 1} + >>> enum = Enum(enumvals) + >>> enumdict = dict([(name, value) for (name, value) in enum]) + >>> enumvals == enumdict + True + + """ + + yield from self._names.items() + + def __eq__(self, other): + """Is the other enumerated type equivalent to this one? + + Two enumerated types are equivalent if they have exactly the same + enumerated values (i.e. with the same names and concrete values). + + Examples + -------- + + Let ``enum*`` be enumerated types defined as: + + >>> enum1 = Enum({'T0': 0, 'T1': 2}) + >>> enum2 = Enum(enum1) + >>> enum3 = Enum({'T1': 2, 'T0': 0}) + >>> enum4 = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + >>> enum5 = Enum({'T0': 0}) + >>> enum6 = Enum({'T0': 10, 'T1': 20}) + + then: + + >>> enum1 == enum1 + True + >>> enum1 == enum2 == enum3 + True + >>> enum1 == enum4 + False + >>> enum5 == enum1 + False + >>> enum1 == enum6 + False + + Comparing enumerated types with other kinds of objects produces + a false result: + + >>> enum1 == {'T0': 0, 'T1': 2} + False + >>> enum1 == ['T0', 'T1'] + False + >>> enum1 == 2 + False + + """ + + if not isinstance(other, Enum): + return False + return self._names == other._names + + def __ne__(self, other): + """Is the `other` enumerated type different from this one? + + Two enumerated types are different if they don't have exactly + the same enumerated values (i.e. with the same names and + concrete values). + + Examples + -------- + + Let ``enum*`` be enumerated types defined as: + + >>> enum1 = Enum({'T0': 0, 'T1': 2}) + >>> enum2 = Enum(enum1) + >>> enum3 = Enum({'T1': 2, 'T0': 0}) + >>> enum4 = Enum({'T0': 0, 'T1': 2, 'T2': 5}) + >>> enum5 = Enum({'T0': 0}) + >>> enum6 = Enum({'T0': 10, 'T1': 20}) + + then: + + >>> enum1 != enum1 + False + >>> enum1 != enum2 != enum3 + False + >>> enum1 != enum4 + True + >>> enum5 != enum1 + True + >>> enum1 != enum6 + True + + """ + + return not self.__eq__(other) + + # XXX: API incompatible change for PyTables 3 line + # Overriding __eq__ blocks inheritance of __hash__ in 3.x + # def __hash__(self): + # return hash((self.__class__, tuple(self._names.items()))) + def __repr__(self): + """Return the canonical string representation of the enumeration. The + output of this method can be evaluated to give a new enumeration object + that will compare equal to this one. + + Examples + -------- + >>> repr(Enum({'name': 10})) + "Enum({'name': 10})" + + """ + + return 'Enum(%s)' % self._names + + +def _test(): + import doctest + return doctest.testmod() + + +if __name__ == '__main__': + _test() diff --git a/tables/misc/proxydict.py b/tables/misc/proxydict.py new file mode 100644 index 0000000..df288ef --- /dev/null +++ b/tables/misc/proxydict.py @@ -0,0 +1,58 @@ +"""Proxy dictionary for objects stored in a container.""" +import weakref + + +class ProxyDict(dict): + """A dictionary which uses a container object to store its values.""" + + def __init__(self, container): + self.containerref = weakref.ref(container) + """A weak reference to the container object. + + .. versionchanged:: 3.0 + The *containerRef* attribute has been renamed into + *containerref*. + + """ + + def __getitem__(self, key): + if key not in self: + raise KeyError(key) + + # Values are not actually stored to avoid extra references. + return self._get_value_from_container(self._get_container(), key) + + def __setitem__(self, key, value): + # Values are not actually stored to avoid extra references. 
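        # Only the key is recorded here; ``None`` goes in as a placeholder so
        # that ``__getitem__`` later fetches the real value from the weakly
        # referenced container (see ``_get_container``), keeping no extra
        # reference to the value itself.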
+ super().__setitem__(key, None) + + def __repr__(self): + return object.__repr__(self) + + def __str__(self): + # C implementation does not use `self.__getitem__()`. :( + return '{' + ", ".join("{k!r}: {v!r}" for k, v in self.items()) + '}' + + def values(self): + # C implementation does not use `self.__getitem__()`. :( + return [self[key] for key in self.keys()] + + def itervalues(self): + # C implementation does not use `self.__getitem__()`. :( + for key in self.keys(): + yield self[key] + + def items(self): + # C implementation does not use `self.__getitem__()`. :( + return [(key, self[key]) for key in self.keys()] + + def iteritems(self): + # C implementation does not use `self.__getitem__()`. :( + for key in self.keys(): + yield (key, self[key]) + + def _get_container(self): + container = self.containerref() + if container is None: + raise ValueError("the container object does no longer exist") + return container diff --git a/tables/node.py b/tables/node.py new file mode 100644 index 0000000..ec687bc --- /dev/null +++ b/tables/node.py @@ -0,0 +1,892 @@ +"""PyTables nodes.""" + +import warnings +import functools + +from .registry import class_name_dict, class_id_dict +from .exceptions import (ClosedNodeError, NodeError, UndoRedoWarning, + PerformanceWarning) +from .path import join_path, split_path, isvisiblepath +from .utils import lazyattr +from .undoredo import move_to_shadow +from .attributeset import AttributeSet, NotLoggedAttributeSet + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +def _closedrepr(oldmethod): + """Decorate string representation method to handle closed nodes. + + If the node is closed, a string like this is returned:: + + + + instead of calling `oldmethod` and returning its result. + + """ + + @functools.wraps(oldmethod) + def newmethod(self): + if not self._v_isopen: + return (f'') + return oldmethod(self) + + return newmethod + + +class MetaNode(type): + """Node metaclass. + + This metaclass ensures that their instance classes get registered + into several dictionaries (namely the `tables.utils.class_name_dict` + class name dictionary and the `tables.utils.class_id_dict` class + identifier dictionary). + + It also adds sanity checks to some methods: + + * Check that the node is open when calling string representation + and provide a default string if so. + + """ + + def __new__(mcs, name, bases, dict_): + # Add default behaviour for representing closed nodes. + for mname in ['__str__', '__repr__']: + if mname in dict_: + dict_[mname] = _closedrepr(dict_[mname]) + + return type.__new__(mcs, name, bases, dict_) + + def __init__(cls, name, bases, dict_): + super().__init__(name, bases, dict_) + + # Always register into class name dictionary. + class_name_dict[cls.__name__] = cls + + # Register into class identifier dictionary only if the class + # has an identifier and it is different from its parents'. + cid = getattr(cls, '_c_classid', None) + if cid is not None: + for base in bases: + pcid = getattr(base, '_c_classid', None) + if pcid == cid: + break + else: + class_id_dict[cid] = cls + + +class Node(metaclass=MetaNode): + """Abstract base class for all PyTables nodes. + + This is the base class for *all* nodes in a PyTables hierarchy. It is an + abstract class, i.e. it may not be directly instantiated; however, every + node in the hierarchy is an instance of this class. + + A PyTables node is always hosted in a PyTables *file*, under a *parent + group*, at a certain *depth* in the node hierarchy. 
A node knows its own + *name* in the parent group and its own *path name* in the file. + + All the previous information is location-dependent, i.e. it may change when + moving or renaming a node in the hierarchy. A node also has + location-independent information, such as its *HDF5 object identifier* and + its *attribute set*. + + This class gathers the operations and attributes (both location-dependent + and independent) which are common to all PyTables nodes, whatever their + type is. Nonetheless, due to natural naming restrictions, the names of all + of these members start with a reserved prefix (see the Group class + in :ref:`GroupClassDescr`). + + Sub-classes with no children (e.g. *leaf nodes*) may define new methods, + attributes and properties to avoid natural naming restrictions. For + instance, _v_attrs may be shortened to attrs and _f_rename to + rename. However, the original methods and attributes should still be + available. + + .. rubric:: Node attributes + + .. attribute:: _v_depth + + The depth of this node in the tree (an non-negative integer value). + + .. attribute:: _v_file + + The hosting File instance (see :ref:`FileClassDescr`). + + .. attribute:: _v_name + + The name of this node in its parent group (a string). + + .. attribute:: _v_pathname + + The path of this node in the tree (a string). + + .. attribute:: _v_objectid + + A node identifier (may change from run to run). + + .. versionchanged:: 3.0 + The *_v_objectID* attribute has been renamed into *_v_object_id*. + + """ + + # By default, attributes accept Undo/Redo. + _AttributeSet = AttributeSet + + # `_v_parent` is accessed via its file to avoid upwards references. + def _g_getparent(self): + """The parent :class:`Group` instance""" + (parentpath, nodename) = split_path(self._v_pathname) + return self._v_file._get_node(parentpath) + + _v_parent = property(_g_getparent) + + # '_v_attrs' is defined as a lazy read-only attribute. + # This saves 0.7s/3.8s. + @lazyattr + def _v_attrs(self): + """The associated `AttributeSet` instance. + + See Also + -------- + tables.attributeset.AttributeSet : container for the HDF5 attributes + + """ + + return self._AttributeSet(self) + + # '_v_title' is a direct read-write shorthand for the 'TITLE' attribute + # with the empty string as a default value. + def _g_gettitle(self): + """A description of this node. A shorthand for TITLE attribute.""" + if hasattr(self._v_attrs, 'TITLE'): + return self._v_attrs.TITLE + else: + return '' + + def _g_settitle(self, title): + self._v_attrs.TITLE = title + + _v_title = property(_g_gettitle, _g_settitle) + + # This may be looked up by ``__del__`` when ``__init__`` doesn't get + # to be called. See ticket #144 for more info. + _v_isopen = False + """Whehter this node is open or not.""" + + # The ``_log`` argument is only meant to be used by ``_g_copy_as_child()`` + # to avoid logging the creation of children nodes of a copied sub-tree. + def __init__(self, parentnode, name, _log=True): + # Remember to assign these values in the root group constructor + # as it does not use this method implementation! 
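        # Outline of what follows (descriptive only): dereference a SoftLink
        # parent if needed, reset the location-dependent attributes to their
        # defaults, validate the parent group and (for new nodes) the
        # writability of the file, bind the node to its parent and set its
        # location, create or open the underlying HDF5 node, and finally log
        # the creation and run the post-init hook; on any failure the node is
        # closed again to undo the registrations made so far.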
+ + # if the parent node is a softlink, dereference it + if isinstance(parentnode, class_name_dict['SoftLink']): + parentnode = parentnode.dereference() + + self._v_file = None + """The hosting File instance (see :ref:`FileClassDescr`).""" + + self._v_isopen = False + """Whether this node is open or not.""" + + self._v_pathname = None + """The path of this node in the tree (a string).""" + + self._v_name = None + """The name of this node in its parent group (a string).""" + + self._v_depth = None + """The depth of this node in the tree (an non-negative integer value). + """ + + self._v_maxtreedepth = parentnode._v_file.params['MAX_TREE_DEPTH'] + """Maximum tree depth before warning the user. + + .. versionchanged:: 3.0 + Renamed into *_v_maxtreedepth* from *_v_maxTreeDepth*. + + """ + + self._v__deleting = False + """Is the node being deleted?""" + + self._v_objectid = None + """A node identifier (may change from run to run). + + .. versionchanged:: 3.0 + The *_v_objectID* attribute has been renamed into *_v_objectid*. + + """ + + validate = new = self._v_new # set by subclass constructor + + # Is the parent node a group? Is it open? + self._g_check_group(parentnode) + parentnode._g_check_open() + file_ = parentnode._v_file + + # Will the file be able to host a new node? + if new: + file_._check_writable() + + # Bind to the parent node and set location-dependent information. + if new: + # Only new nodes need to be referenced. + # Opened nodes are already known by their parent group. + parentnode._g_refnode(self, name, validate) + self._g_set_location(parentnode, name) + + try: + # hdf5extension operations: + # Update node attributes. + self._g_new(parentnode, name, init=True) + # Create or open the node and get its object ID. + if new: + self._v_objectid = self._g_create() + else: + self._v_objectid = self._g_open() + + # The node *has* been created, log that. + if new and _log and file_.is_undo_enabled(): + self._g_log_create() + + # This allows extra operations after creating the node. + self._g_post_init_hook() + except Exception: + # If anything happens, the node must be closed + # to undo every possible registration made so far. + # We do *not* rely on ``__del__()`` doing it later, + # since it might never be called anyway. + self._f_close() + raise + + def _g_log_create(self): + self._v_file._log('CREATE', self._v_pathname) + + def __del__(self): + # Closed `Node` instances can not be killed and revived. + # Instead, accessing a closed and deleted (from memory, not + # disk) one yields a *new*, open `Node` instance. This is + # because of two reasons: + # + # 1. Predictability. After closing a `Node` and deleting it, + # only one thing can happen when accessing it again: a new, + # open `Node` instance is returned. If closed nodes could be + # revived, one could get either a closed or an open `Node`. + # + # 2. Ease of use. If the user wants to access a closed node + # again, the only condition would be that no references to + # the `Node` instance were left. If closed nodes could be + # revived, the user would also need to force the closed + # `Node` out of memory, which is not a trivial task. + # + + if not self._v_isopen: + return # the node is already closed or not initialized + + self._v__deleting = True + + # If we get here, the `Node` is still open. + try: + node_manager = self._v_file._node_manager + node_manager.drop_node(self, check_unregistered=False) + finally: + # At this point the node can still be open if there is still some + # alive reference around (e.g. 
if the __del__ method is called + # explicitly by the user). + if self._v_isopen: + self._v__deleting = True + self._f_close() + + def _g_pre_kill_hook(self): + """Code to be called before killing the node.""" + pass + + def _g_create(self): + """Create a new HDF5 node and return its object identifier.""" + raise NotImplementedError + + def _g_open(self): + """Open an existing HDF5 node and return its object identifier.""" + raise NotImplementedError + + def _g_check_open(self): + """Check that the node is open. + + If the node is closed, a `ClosedNodeError` is raised. + + """ + + if not self._v_isopen: + raise ClosedNodeError("the node object is closed") + assert self._v_file.isopen, "found an open node in a closed file" + + def _g_set_location(self, parentnode, name): + """Set location-dependent attributes. + + Sets the location-dependent attributes of this node to reflect + that it is placed under the specified `parentnode`, with the + specified `name`. + + This also triggers the insertion of file references to this + node. If the maximum recommended tree depth is exceeded, a + `PerformanceWarning` is issued. + + """ + + file_ = parentnode._v_file + parentdepth = parentnode._v_depth + + self._v_file = file_ + self._v_isopen = True + + root_uep = file_.root_uep + if name.startswith(root_uep): + # This has been called from File._get_node() + assert parentdepth == 0 + if root_uep == "/": + self._v_pathname = name + else: + self._v_pathname = name[len(root_uep):] + _, self._v_name = split_path(name) + self._v_depth = name.count("/") - root_uep.count("/") + 1 + else: + # If we enter here is because this has been called elsewhere + self._v_name = name + self._v_pathname = join_path(parentnode._v_pathname, name) + self._v_depth = parentdepth + 1 + + # Check if the node is too deep in the tree. + if parentdepth >= self._v_maxtreedepth: + warnings.warn("""\ +node ``%s`` is exceeding the recommended maximum depth (%d);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v_pathname, self._v_maxtreedepth), + PerformanceWarning) + + if self._v_pathname != '/': + file_._node_manager.cache_node(self, self._v_pathname) + + def _g_update_location(self, newparentpath): + """Update location-dependent attributes. + + Updates location data when an ancestor node has changed its + location in the hierarchy to `newparentpath`. In fact, this + method is expected to be called by an ancestor of this node. + + This also triggers the update of file references to this node. + If the maximum recommended node depth is exceeded, a + `PerformanceWarning` is issued. This warning is assured to be + unique. + + """ + + oldpath = self._v_pathname + newpath = join_path(newparentpath, self._v_name) + newdepth = newpath.count('/') + + self._v_pathname = newpath + self._v_depth = newdepth + + # Check if the node is too deep in the tree. + if newdepth > self._v_maxtreedepth: + warnings.warn("""\ +moved descendent node is exceeding the recommended maximum depth (%d);\ +be ready to see PyTables asking for *lots* of memory and possibly slow I/O""" + % (self._v_maxtreedepth,), PerformanceWarning) + + node_manager = self._v_file._node_manager + node_manager.rename_node(oldpath, newpath) + + # Tell dependent objects about the new location of this node. + self._g_update_dependent() + + def _g_del_location(self): + """Clear location-dependent attributes. + + This also triggers the removal of file references to this node. 
+ + """ + + node_manager = self._v_file._node_manager + pathname = self._v_pathname + + if not self._v__deleting: + node_manager.drop_from_cache(pathname) + # Note: node_manager.drop_node do not removes the node form the + # registry if it is still open + node_manager.registry.pop(pathname, None) + + self._v_file = None + self._v_isopen = False + self._v_pathname = None + self._v_name = None + self._v_depth = None + + def _g_post_init_hook(self): + """Code to be run after node creation and before creation logging.""" + pass + + def _g_update_dependent(self): + """Update dependent objects after a location change. + + All dependent objects (but not nodes!) referencing this node + must be updated here. + + """ + + if '_v_attrs' in self.__dict__: + self._v_attrs._g_update_node_location(self) + + def _f_close(self): + """Close this node in the tree. + + This releases all resources held by the node, so it should not + be used again. On nodes with data, it may be flushed to disk. + + You should not need to close nodes manually because they are + automatically opened/closed when they are loaded/evicted from + the integrated LRU cache. + + """ + + # After calling ``_f_close()``, two conditions are met: + # + # 1. The node object is detached from the tree. + # 2. *Every* attribute of the node is removed. + # + # Thus, cleanup operations used in ``_f_close()`` in sub-classes + # must be run *before* calling the method in the superclass. + + if not self._v_isopen: + return # the node is already closed + + myDict = self.__dict__ + + # Close the associated `AttributeSet` + # only if it has already been placed in the object's dictionary. + if '_v_attrs' in myDict: + self._v_attrs._g_close() + + # Detach the node from the tree if necessary. + self._g_del_location() + + # Finally, clear all remaining attributes from the object. + myDict.clear() + + # Just add a final flag to signal that the node is closed: + self._v_isopen = False + + def _g_remove(self, recursive, force): + """Remove this node from the hierarchy. + + If the node has children, recursive removal must be stated by + giving `recursive` a true value; otherwise, a `NodeError` will + be raised. + + If `force` is set to true, the node will be removed no matter it + has children or not (useful for deleting hard links). + + It does not log the change. + + """ + + # Remove the node from the PyTables hierarchy. + parent = self._v_parent + parent._g_unrefnode(self._v_name) + # Close the node itself. + self._f_close() + # hdf5extension operations: + # Remove the node from the HDF5 hierarchy. + self._g_delete(parent) + + def _f_remove(self, recursive=False, force=False): + """Remove this node from the hierarchy. + + If the node has children, recursive removal must be stated by giving + recursive a true value; otherwise, a NodeError will be raised. + + If the node is a link to a Group object, and you are sure that you want + to delete it, you can do this by setting the force flag to true. + + """ + + self._g_check_open() + file_ = self._v_file + file_._check_writable() + + if file_.is_undo_enabled(): + self._g_remove_and_log(recursive, force) + else: + self._g_remove(recursive, force) + + def _g_remove_and_log(self, recursive, force): + file_ = self._v_file + oldpathname = self._v_pathname + # Log *before* moving to use the right shadow name. + file_._log('REMOVE', oldpathname) + move_to_shadow(file_, oldpathname) + + def _g_move(self, newparent, newname): + """Move this node in the hierarchy. 
+ + Moves the node into the given `newparent`, with the given + `newname`. + + It does not log the change. + + """ + + oldparent = self._v_parent + oldname = self._v_name + oldpathname = self._v_pathname # to move the HDF5 node + + # Try to insert the node into the new parent. + newparent._g_refnode(self, newname) + # Remove the node from the new parent. + oldparent._g_unrefnode(oldname) + + # Remove location information for this node. + self._g_del_location() + # Set new location information for this node. + self._g_set_location(newparent, newname) + + # hdf5extension operations: + # Update node attributes. + self._g_new(newparent, self._v_name, init=False) + # Move the node. + # self._v_parent._g_move_node(oldpathname, self._v_pathname) + self._v_parent._g_move_node(oldparent._v_objectid, oldname, + newparent._v_objectid, newname, + oldpathname, self._v_pathname) + + # Tell dependent objects about the new location of this node. + self._g_update_dependent() + + def _f_rename(self, newname, overwrite=False): + """Rename this node in place. + + Changes the name of a node to *newname* (a string). If a node with the + same newname already exists and overwrite is true, recursively remove + it before renaming. + + """ + + self._f_move(newname=newname, overwrite=overwrite) + + def _f_move(self, newparent=None, newname=None, + overwrite=False, createparents=False): + """Move or rename this node. + + Moves a node into a new parent group, or changes the name of the + node. newparent can be a Group object (see :ref:`GroupClassDescr`) or a + pathname in string form. If it is not specified or None, the current + parent group is chosen as the new parent. newname must be a string + with a new name. If it is not specified or None, the current name is + chosen as the new name. If createparents is true, the needed groups for + the given new parent group path to exist will be created. + + Moving a node across databases is not allowed, nor it is moving a node + *into* itself. These result in a NodeError. However, moving a node + *over* itself is allowed and simply does nothing. Moving over another + existing node is similarly not allowed, unless the optional overwrite + argument is true, in which case that node is recursively removed before + moving. + + Usually, only the first argument will be used, effectively moving the + node to a new location without changing its name. Using only the + second argument is equivalent to renaming the node in place. + + """ + + self._g_check_open() + file_ = self._v_file + oldparent = self._v_parent + oldname = self._v_name + + # Set default arguments. + if newparent is None and newname is None: + raise NodeError("you should specify at least " + "a ``newparent`` or a ``newname`` parameter") + if newparent is None: + newparent = oldparent + if newname is None: + newname = oldname + + # Get destination location. + if hasattr(newparent, '_v_file'): # from node + newfile = newparent._v_file + newpath = newparent._v_pathname + elif hasattr(newparent, 'startswith'): # from path + newfile = file_ + newpath = newparent + else: + raise TypeError("new parent is not a node nor a path: %r" + % (newparent,)) + + # Validity checks on arguments. + # Is it in the same file? + if newfile is not file_: + raise NodeError("nodes can not be moved across databases; " + "please make a copy of the node") + + # The movement always fails if the hosting file can not be modified. + file_._check_writable() + + # Moving over itself? 
+ oldpath = oldparent._v_pathname + if newpath == oldpath and newname == oldname: + # This is equivalent to renaming the node to its current name, + # and it does not change the referenced object, + # so it is an allowed no-op. + return + + # Moving into itself? + self._g_check_not_contains(newpath) + + # Note that the previous checks allow us to go ahead and create + # the parent groups if `createparents` is true. `newparent` is + # used instead of `newpath` to avoid accepting `Node` objects + # when `createparents` is true. + newparent = file_._get_or_create_path(newparent, createparents) + self._g_check_group(newparent) # Is it a group? + + # Moving over an existing node? + self._g_maybe_remove(newparent, newname, overwrite) + + # Move the node. + oldpathname = self._v_pathname + self._g_move(newparent, newname) + + # Log the change. + if file_.is_undo_enabled(): + self._g_log_move(oldpathname) + + def _g_log_move(self, oldpathname): + self._v_file._log('MOVE', oldpathname, self._v_pathname) + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + """Copy this node and return the new one. + + Creates and returns a copy of the node in the given `newparent`, + with the given `newname`. If `recursive` copy is stated, all + descendents are copied as well. Additional keyword argumens may + affect the way that the copy is made. Unknown arguments must be + ignored. On recursive copies, all keyword arguments must be + passed on to the children invocation of this method. + + If `_log` is false, the change is not logged. This is *only* + intended to be used by ``_g_copy_as_child()`` as a means of + optimising sub-tree copies. + + """ + + raise NotImplementedError + + def _g_copy_as_child(self, newparent, **kwargs): + """Copy this node as a child of another group. + + Copies just this node into `newparent`, not recursing children + nor overwriting nodes nor logging the copy. This is intended to + be used when copying whole sub-trees. + + """ + + return self._g_copy(newparent, self._v_name, + recursive=False, _log=False, **kwargs) + + def _f_copy(self, newparent=None, newname=None, + overwrite=False, recursive=False, createparents=False, + **kwargs): + """Copy this node and return the new node. + + Creates and returns a copy of the node, maybe in a different place in + the hierarchy. newparent can be a Group object (see + :ref:`GroupClassDescr`) or a pathname in string form. If it is not + specified or None, the current parent group is chosen as the new + parent. newname must be a string with a new name. If it is not + specified or None, the current name is chosen as the new name. If + recursive copy is stated, all descendants are copied as well. If + createparents is true, the needed groups for the given new parent group + path to exist will be created. + + Copying a node across databases is supported but can not be + undone. Copying a node over itself is not allowed, nor it is + recursively copying a node into itself. These result in a + NodeError. Copying over another existing node is similarly not allowed, + unless the optional overwrite argument is true, in which case that node + is recursively removed before copying. + + Additional keyword arguments may be passed to customize the copying + process. For instance, title and filters may be changed, user + attributes may be or may not be copied, data may be sub-sampled, stats + may be collected, etc. See the documentation for the particular node + type. 
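        For instance, a minimal sketch (the destination group and the names
        used here are only illustrative)::

            node._f_copy('/backup', 'scan_copy', recursive=True,
                         createparents=True, title='Backup copy')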
+ + Using only the first argument is equivalent to copying the node to a + new location without changing its name. Using only the second argument + is equivalent to making a copy of the node in the same group. + + """ + + self._g_check_open() + srcfile = self._v_file + srcparent = self._v_parent + srcname = self._v_name + + dstparent = newparent + dstname = newname + + # Set default arguments. + if dstparent is None and dstname is None: + raise NodeError("you should specify at least " + "a ``newparent`` or a ``newname`` parameter") + if dstparent is None: + dstparent = srcparent + if dstname is None: + dstname = srcname + + # Get destination location. + if hasattr(dstparent, '_v_file'): # from node + dstfile = dstparent._v_file + dstpath = dstparent._v_pathname + elif hasattr(dstparent, 'startswith'): # from path + dstfile = srcfile + dstpath = dstparent + else: + raise TypeError("new parent is not a node nor a path: %r" + % (dstparent,)) + + # Validity checks on arguments. + if dstfile is srcfile: + # Copying over itself? + srcpath = srcparent._v_pathname + if dstpath == srcpath and dstname == srcname: + raise NodeError( + "source and destination nodes are the same node: ``%s``" + % self._v_pathname) + + # Recursively copying into itself? + if recursive: + self._g_check_not_contains(dstpath) + + # Note that the previous checks allow us to go ahead and create + # the parent groups if `createparents` is true. `dstParent` is + # used instead of `dstPath` because it may be in other file, and + # to avoid accepting `Node` objects when `createparents` is + # true. + dstparent = srcfile._get_or_create_path(dstparent, createparents) + self._g_check_group(dstparent) # Is it a group? + + # Copying to another file with undo enabled? + if dstfile is not srcfile and srcfile.is_undo_enabled(): + warnings.warn("copying across databases can not be undone " + "nor redone from this database", + UndoRedoWarning) + + # Copying over an existing node? + self._g_maybe_remove(dstparent, dstname, overwrite) + + # Copy the node. + # The constructor of the new node takes care of logging. + return self._g_copy(dstparent, dstname, recursive, **kwargs) + + def _f_isvisible(self): + """Is this node visible?""" + + self._g_check_open() + return isvisiblepath(self._v_pathname) + + def _g_check_group(self, node): + # Node must be defined in order to define a Group. + # However, we need to know Group here. + # Using class_name_dict avoids a circular import. + if not isinstance(node, class_name_dict['Node']): + raise TypeError("new parent is not a registered node: %s" + % node._v_pathname) + if not isinstance(node, class_name_dict['Group']): + raise TypeError("new parent node ``%s`` is not a group" + % node._v_pathname) + + def _g_check_not_contains(self, pathname): + # The not-a-TARDIS test. ;) + mypathname = self._v_pathname + if (mypathname == '/' # all nodes fall below the root group + or pathname == mypathname + or pathname.startswith(mypathname + '/')): + raise NodeError("can not move or recursively copy node ``%s`` " + "into itself" % mypathname) + + def _g_maybe_remove(self, parent, name, overwrite): + if name in parent: + if not overwrite: + raise NodeError( + f"destination group ``{parent._v_pathname}`` already " + f"has a node named ``{name}``; you may want to use the " + f"``overwrite`` argument") + parent._f_get_child(name)._f_remove(True) + + def _g_check_name(self, name): + """Check validity of name for this particular kind of node. 
+ + This is invoked once the standard HDF5 and natural naming checks + have successfully passed. + + """ + + if name.startswith('_i_'): + # This is reserved for table index groups. + raise ValueError( + "node name starts with reserved prefix ``_i_``: %s" % name) + + def _f_getattr(self, name): + """Get a PyTables attribute from this node. + + If the named attribute does not exist, an AttributeError is + raised. + + """ + + return getattr(self._v_attrs, name) + + def _f_setattr(self, name, value): + """Set a PyTables attribute for this node. + + If the node already has a large number of attributes, a + PerformanceWarning is issued. + + """ + + setattr(self._v_attrs, name, value) + + def _f_delattr(self, name): + """Delete a PyTables attribute from this node. + + If the named attribute does not exist, an AttributeError is + raised. + + """ + + delattr(self._v_attrs, name) + + # + + +class NotLoggedMixin: + # Include this class in your inheritance tree + # to avoid changes to instances of your class from being logged. + + _AttributeSet = NotLoggedAttributeSet + + def _g_log_create(self): + pass + + def _g_log_move(self, oldpathname): + pass + + def _g_remove_and_log(self, recursive, force): + self._g_remove(recursive, force) diff --git a/tables/nodes/__init__.py b/tables/nodes/__init__.py new file mode 100644 index 0000000..974cc46 --- /dev/null +++ b/tables/nodes/__init__.py @@ -0,0 +1,14 @@ +"""Special node behaviours for PyTables. + +This package contains several modules that give specific behaviours +to PyTables nodes. For instance, the filenode module provides +a file interface to a PyTables node. + + +Package modules: + filenode -- A file interface to nodes for PyTables databases. + +""" + +# The list of names to be exported to the importing module. +__all__ = ['filenode'] diff --git a/tables/nodes/filenode.py b/tables/nodes/filenode.py new file mode 100644 index 0000000..2e58463 --- /dev/null +++ b/tables/nodes/filenode.py @@ -0,0 +1,858 @@ +"""A file interface to nodes for PyTables databases. + +The FileNode module provides a file interface for using inside of +PyTables database files. Use the new_node() function to create a brand +new file node which can be read and written as any ordinary Python +file. Use the open_node() function to open an existing (i.e. created +with new_node()) node for read-only or read-write access. Read acces +is always available. Write access (enabled on new files and files +opened with mode 'a+') only allows appending data to a file node. + +Currently only binary I/O is supported. + +See :ref:`filenode_usersguide` for instructions on use. + +.. versionchanged:: 3.0 + In version 3.0 the module as been completely rewritten to be fully + compliant with the interfaces defined in the :mod:`io` module. + +""" + +import io +import os +import re +import warnings +from pathlib import Path + +import numpy as np + +import tables as tb + + +NodeType = 'file' +"""Value for NODE_TYPE node system attribute.""" + +NodeTypeVersions = [1, 2] +"""Supported values for NODE_TYPE_VERSION node system attribute.""" + + +class RawPyTablesIO(io.RawIOBase): + """Base class for raw binary I/O on HDF5 files using PyTables.""" + + # A lambda to turn a size into a shape, for each version. 
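    # (index 0 is unused; version 1 maps a size ``l`` to an ``(l, 1)`` array
    # shape, version 2 to a flat ``(l,)`` shape)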
+ _size_to_shape = [ + None, + lambda l: (l, 1), + lambda l: (l, ), + ] + + def __init__(self, node, mode=None): + super().__init__() + + self._check_node(node) + self._check_attributes(node) + + if mode is None: + mode = node._v_file.mode + else: + self._check_mode(mode) + self._cross_check_mode(mode, node._v_file.mode) + + self._node = node + self._mode = mode + self._pos = 0 + self._version = int(node.attrs.NODE_TYPE_VERSION) + self._vshape = self._size_to_shape[self._version] + self._vtype = node.atom.dtype.base.type + + # read only attribute + @property + def mode(self): + """File mode.""" + + return self._mode + + # def tell(self) -> int: + def tell(self): + """Return current stream position.""" + + self._checkClosed() + return self._pos + + # def seek(self, pos: int, whence: int = 0) -> int: + def seek(self, pos, whence=0): + """Change stream position. + + Change the stream position to byte offset offset. offset is + interpreted relative to the position indicated by whence. Values + for whence are: + + * 0 -- start of stream (the default); offset should be zero or positive + * 1 -- current stream position; offset may be negative + * 2 -- end of stream; offset is usually negative + + Return the new absolute position. + + """ + + self._checkClosed() + try: + pos = pos.__index__() + # except AttributeError as err: + # raise TypeError("an integer is required") from err + except AttributeError: + raise TypeError("an integer is required") + if whence == 0: + if pos < 0: + raise ValueError(f"negative seek position {pos!r}") + self._pos = pos + elif whence == 1: + self._pos = max(0, self._pos + pos) + elif whence == 2: + self._pos = max(0, self._node.nrows + pos) + else: + raise ValueError("invalid whence value") + return self._pos + + # def seekable(self) -> bool: + def seekable(self): + """Return whether object supports random access. + + If False, seek(), tell() and truncate() will raise IOError. This + method may need to do a test seek(). + + """ + + return True + + # def fileno(self) -> int: + def fileno(self): + """Returns underlying file descriptor if one exists. + + An IOError is raised if the IO object does not use a file + descriptor. + + """ + + self._checkClosed() + return self._node._v_file.fileno() + + # def close(self) -> None: + def close(self): + """Flush and close the IO object. + + This method has no effect if the file is already closed. + + """ + + if not self.closed: + if getattr(self._node, '_v_file', None) is None: + warnings.warn("host PyTables file is already closed!") + + try: + super().close() + finally: + # Release node object to allow closing the file. + self._node = None + + def flush(self): + """Flush write buffers, if applicable. + + This is not implemented for read-only and non-blocking streams. + + """ + + self._checkClosed() + self._node.flush() + + # def truncate(self, pos: int = None) -> int: + def truncate(self, pos=None): + """Truncate file to size bytes. + + Size defaults to the current IO position as reported by tell(). + Return the new size. + + Currently, this method only makes sense to grow the file node, + since data can not be rewritten nor deleted. 
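        For instance (a sketch; the size is only illustrative)::

            fnode.truncate(1024)   # grow the file node to 1024 bytes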
+ + """ + + self._checkClosed() + self._checkWritable() + + if pos is None: + pos = self._pos + elif pos < 0: + raise ValueError(f"negative truncate position {pos!r}") + + if pos < self._node.nrows: + raise OSError("truncating is only allowed for growing a file") + self._append_zeros(pos - self._node.nrows) + + return self.seek(pos) + + # def readable(self) -> bool: + def readable(self): + """Return whether object was opened for reading. + + If False, read() will raise IOError. + + """ + + mode = self._mode + return 'r' in mode or '+' in mode + + # def writable(self) -> bool: + def writable(self): + """Return whether object was opened for writing. + + If False, write() and truncate() will raise IOError. + + """ + + mode = self._mode + return 'w' in mode or 'a' in mode or '+' in mode + + # def readinto(self, b: bytearray) -> int: + def readinto(self, b): + """Read up to len(b) bytes into b. + + Returns number of bytes read (0 for EOF), or None if the object + is set not to block as has no data to read. + + """ + + self._checkClosed() + self._checkReadable() + + if self._pos >= self._node.nrows: + return 0 + + n = len(b) + start = self._pos + stop = self._pos + n + + # XXX optimized path + # if stop <= self._node.nrows and isinstance(b, np.ndarray): + # self._node.read(start, stop, out=b) + # self._pos += n + # return n + + if stop > self._node.nrows: + stop = self._node.nrows + n = stop - start + + # XXX This ought to work with anything that supports the buffer API + b[:n] = self._node.read(start, stop).tobytes() + + self._pos += n + + return n + + # def readline(self, limit: int = -1) -> bytes: + def readline(self, limit=-1): + """Read and return a line from the stream. + + If limit is specified, at most limit bytes will be read. + + The line terminator is always ``\\n`` for binary files; for text + files, the newlines argument to open can be used to select the line + terminator(s) recognized. + + """ + + self._checkClosed() + self._checkReadable() + + chunksize = self._node.chunkshape[0] if self._node.chunkshape else -1 + + # XXX: check + lsep = b'\n' + lseplen = len(lsep) + + # Set the remaining bytes to read to the specified size. + remsize = limit + + partial = [] + finished = False + + while not finished: + # Read a string limited by the remaining number of bytes. + if limit <= 0: + ibuff = self.read(chunksize) + else: + ibuff = self.read(min(remsize, chunksize)) + ibufflen = len(ibuff) + remsize -= ibufflen + + if ibufflen >= lseplen: + # Separator fits, look for EOL string. + eolindex = ibuff.find(lsep) + elif ibufflen == 0: + # EOF was immediately reached. + finished = True + continue + else: # ibufflen < lseplen + # EOF was hit and separator does not fit. ;) + partial.append(ibuff) + finished = True + continue + + if eolindex >= 0: + # Found an EOL. If there are trailing characters, + # cut the input buffer and seek back; + # else add the whole input buffer. + trailing = ibufflen - lseplen - eolindex # Bytes beyond EOL. + if trailing > 0: + obuff = ibuff[:-trailing] + self.seek(-trailing, 1) + remsize += trailing + else: + obuff = ibuff + finished = True + elif lseplen > 1 and (limit <= 0 or remsize > 0): + # Seek back a little since the end of the read string + # may have fallen in the middle of the line separator. + obuff = ibuff[:-lseplen + 1] + self.seek(-lseplen + 1, 1) + remsize += lseplen - 1 + else: # eolindex<0 and (lseplen<=1 or (limit>0 and remsize<=0)) + # Did not find an EOL, add the whole input buffer. + obuff = ibuff + + # Append (maybe cut) buffer. 
+ partial.append(obuff) + + # If a limit has been specified and the remaining count + # reaches zero, the reading is finished. + if limit > 0 and remsize <= 0: + finished = True + + return b''.join(partial) + + # def write(self, b: bytes) -> int: + def write(self, b): + """Write the given buffer to the IO stream. + + Returns the number of bytes written, which may be less than + len(b). + + """ + + self._checkClosed() + self._checkWritable() + + if isinstance(b, str): + raise TypeError("can't write str to binary stream") + + n = len(b) + if n == 0: + return 0 + + pos = self._pos + + # Is the pointer beyond the real end of data? + end2off = pos - self._node.nrows + if end2off > 0: + # Zero-fill the gap between the end of data and the pointer. + self._append_zeros(end2off) + + # Append data. + self._node.append( + np.ndarray(buffer=b, dtype=self._vtype, shape=self._vshape(n))) + + self._pos += n + + return n + + def _checkClosed(self): + """Checks if file node is open. + + Checks whether the file node is open or has been closed. In the + second case, a ValueError is raised. If the host PyTables has + been closed, ValueError is also raised. + + """ + + super()._checkClosed() + if getattr(self._node, '_v_file', None) is None: + raise ValueError("host PyTables file is already closed!") + + def _check_node(self, node): + if not isinstance(node, tb.EArray): + raise TypeError('the "node" parameter should be a tables.EArray') + if not isinstance(node.atom, tb.UInt8Atom): + raise TypeError('only nodes with atom "UInt8Atom" are allowed') + + def _check_mode(self, mode): + if not isinstance(mode, str): + raise TypeError("invalid mode: %r" % mode) + + modes = set(mode) + if modes - set("arwb+tU") or len(mode) > len(modes): + raise ValueError("invalid mode: %r" % mode) + + reading = "r" in modes + writing = "w" in modes + appending = "a" in modes + # updating = "+" in modes + text = "t" in modes + binary = "b" in modes + + if "U" in modes: + if writing or appending: + raise ValueError("can't use U and writing mode at once") + reading = True + + if text and binary: + raise ValueError("can't have text and binary mode at once") + + if reading + writing + appending > 1: + raise ValueError("can't have read/write/append mode at once") + + if not (reading or writing or appending): + raise ValueError("must have exactly one of read/write/append mode") + + def _cross_check_mode(self, mode, h5filemode): + # XXX: check + # readable = bool('r' in mode or '+' in mode) + # h5readable = bool('r' in h5filemode or '+' in h5filemode) + # + # if readable and not h5readable: + # raise ValueError("RawPyTablesIO can't be open in read mode if " + # "the underlying hdf5 file is not readable") + + writable = bool('w' in mode or 'a' in mode or '+' in mode) + h5writable = bool('w' in h5filemode or 'a' in h5filemode or + '+' in h5filemode) + + if writable and not h5writable: + raise ValueError("RawPyTablesIO can't be open in write mode if " + "the underlying hdf5 file is not writable") + + def _check_attributes(self, node): + """Checks file node-specific attributes. + + Checks for the presence and validity + of the system attributes 'NODE_TYPE' and 'NODE_TYPE_VERSION' + in the specified PyTables node (leaf). + ValueError is raised if an attribute is missing or incorrect. 
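        For file nodes this means a ``NODE_TYPE`` equal to ``NodeType``
        ('file') and a ``NODE_TYPE_VERSION`` listed in ``NodeTypeVersions``.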
+ + """ + + attrs = node.attrs + ltype = getattr(attrs, 'NODE_TYPE', None) + ltypever = getattr(attrs, 'NODE_TYPE_VERSION', None) + + if ltype != NodeType: + raise ValueError(f"invalid type of node object: {ltype}") + if ltypever not in NodeTypeVersions: + raise ValueError( + f"unsupported type version of node object: {ltypever}") + + def _append_zeros(self, size): + """_append_zeros(size) -> None. Appends a string of zeros. + + Appends a string of 'size' zeros to the array, + without moving the file pointer. + + """ + + # Appending an empty array would raise an error. + if size == 0: + return + + # XXX This may be redone to avoid a potentially large in-memory array. + self._node.append( + np.zeros(dtype=self._vtype, shape=self._vshape(size))) + + +class FileNodeMixin: + """Mixin class for FileNode objects. + + It provides access to the attribute set of the node that becomes + available via the attrs property. You can add attributes there, but + try to avoid attribute names in all caps or starting with '_', since + they may clash with internal attributes. + + """ + + # The attribute set property methods. + def _get_attrs(self): + """Returns the attribute set of the file node.""" + + # sefl._checkClosed() + return self._node.attrs + + def _set_attrs(self, value): + """set_attrs(string) -> None. Raises ValueError.""" + + raise ValueError("changing the whole attribute set is not allowed") + + def _del_attrs(self): + """del_attrs() -> None. Raises ValueError.""" + + raise ValueError("deleting the whole attribute set is not allowed") + + # The attribute set property. + attrs = property( + _get_attrs, _set_attrs, _del_attrs, + "A property pointing to the attribute set of the file node.") + + +class ROFileNode(FileNodeMixin, RawPyTablesIO): + """Creates a new read-only file node. + + Creates a new read-only file node associated with the specified + PyTables node, providing a standard Python file interface to it. + The node has to have been created on a previous occasion + using the new_node() function. + + The node used as storage is also made available via the read-only + attribute node. Please do not tamper with this object if it's + avoidable, since you may break the operation of the file node object. + + The constructor is not intended to be used directly. + Use the open_node() function in read-only mode ('r') instead. + + :Version 1: + implements the file storage as a UInt8 uni-dimensional EArray. + :Version 2: + uses an UInt8 N vector EArray. + + .. versionchanged:: 3.0 + The offset attribute is no more available, please use seek/tell + methods instead. + + .. versionchanged:: 3.0 + The line_separator property is no more available. + The only line separator used for binary I/O is ``\\n``. + + """ + + def __init__(self, node): + RawPyTablesIO.__init__(self, node, 'r') + self._checkReadable() + + @property + def node(self): + return self._node + + +class RAFileNode(FileNodeMixin, RawPyTablesIO): + """Creates a new read-write file node. + + The first syntax opens the specified PyTables node, while the + second one creates a new node in the specified PyTables file. + In the second case, additional named arguments 'where' and 'name' + must be passed to specify where the file node is to be created. + Other named arguments such as 'title' and 'filters' may also be + passed. The special named argument 'expectedsize', indicating an + estimate of the file size in bytes, may also be passed. + + Write access means reading as well as appending data is allowed. 
+ + The node used as storage is also made available via the read-only + attribute node. Please do not tamper with this object if it's + avoidable, since you may break the operation of the file node object. + + The constructor is not intended to be used directly. + Use the new_node() or open_node() functions instead. + + :Version 1: + implements the file storage as a UInt8 uni-dimensional EArray. + :Version 2: + uses an UInt8 N vector EArray. + + .. versionchanged:: 3.0 + The offset attribute is no more available, please use seek/tell + methods instead. + + .. versionchanged:: 3.0 + The line_separator property is no more available. + The only line separator used for binary I/O is ``\\n``. + + """ + + # The atom representing a byte in the array, for each version. + _byte_shape = [ + None, + (0, 1), + (0,), + ] + + __allowed_init_kwargs = [ + 'where', 'name', 'title', 'filters', 'expectedsize'] + + def __init__(self, node, h5file, **kwargs): + if node is not None: + # Open an existing node and get its version. + self._check_attributes(node) + self._version = node.attrs.NODE_TYPE_VERSION + elif h5file is not None: + # Check for allowed keyword arguments, + # to avoid unwanted arguments falling through to array constructor. + for kwarg in kwargs: + if kwarg not in self.__allowed_init_kwargs: + raise TypeError( + "%s keyword argument is not allowed" % repr(kwarg)) + + # Turn 'expectedsize' into 'expectedrows'. + if 'expectedsize' in kwargs: + # These match since one byte is stored per row. + expectedrows = kwargs['expectedsize'] + kwargs = kwargs.copy() + del kwargs['expectedsize'] + kwargs['expectedrows'] = expectedrows + + # Create a new array in the specified PyTables file. + self._version = NodeTypeVersions[-1] + shape = self._byte_shape[self._version] + node = h5file.create_earray( + atom=tb.UInt8Atom(), shape=shape, **kwargs) + + # Set the node attributes, else remove the array itself. + try: + self._set_attributes(node) + except RuntimeError: + h5file.remove_node(kwargs['where'], kwargs['name']) + raise + + RawPyTablesIO.__init__(self, node, 'a+') + self._checkReadable() + self._checkWritable() + + @property + def node(self): + return self._node + + def _set_attributes(self, node): + """_set_attributes(node) -> None. Adds file node-specific attributes. + + Sets the system attributes 'NODE_TYPE' and 'NODE_TYPE_VERSION' + in the specified PyTables node (leaf). + + """ + + attrs = node.attrs + attrs.NODE_TYPE = NodeType + attrs.NODE_TYPE_VERSION = NodeTypeVersions[-1] + + +def new_node(h5file, **kwargs): + """Creates a new file node object in the specified PyTables file object. + + Additional named arguments where and name must be passed to specify where + the file node is to be created. Other named arguments such as title and + filters may also be passed. + + The special named argument expectedsize, indicating an estimate of the + file size in bytes, may also be passed. It returns the file node object. + + """ + + return RAFileNode(None, h5file, **kwargs) + + +def open_node(node, mode='r'): + """Opens an existing file node. + + Returns a file node object from the existing specified PyTables + node. If mode is not specified or it is 'r', the file can only be + read, and the pointer is positioned at the beginning of the file. If + mode is 'a+', the file can be read and appended, and the pointer is + positioned at the end of the file. 
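    A minimal sketch (``h5file`` is assumed to be an open tables.File that
    already holds a file node at ``/fnode``)::

        fnode = open_node(h5file.get_node('/fnode'), 'a+')
        fnode.write(b'more data')
        fnode.close()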
+ + """ + + if mode == 'r': + return ROFileNode(node) + elif mode == 'a+': + return RAFileNode(node, None) + else: + raise OSError(f"invalid mode: {mode}") + + +def save_to_filenode(h5file, filename, where, name=None, overwrite=False, + title="", filters=None): + """Save a file's contents to a filenode inside a PyTables file. + + .. versionadded:: 3.2 + + Parameters + ---------- + h5file + The PyTables file to be written to; can be either a string + giving the file's location or a :class:`File` object. If a file + with name *h5file* already exists, it will be opened in + mode ``a``. + + filename + Path of the file which shall be stored within the PyTables file. + + where, name + Location of the filenode where the data shall be stored. If + *name* is not given, and *where* is either a :class:`Group` + object or a string ending on ``/``, the leaf name will be set to + the file name of *filename*. The *name* will be modified to + adhere to Python's natural naming convention; the original + filename will be preserved in the filenode's *_filename* + attribute. + + overwrite + Whether or not a possibly existing filenode of the specified + name shall be overwritten. + + title + A description for this node (it sets the ``TITLE`` HDF5 + attribute on disk). + + filters + An instance of the :class:`Filters` class that provides + information about the desired I/O filters to be applied + during the life of this object. + + """ + path = Path(filename).resolve() + + # sanity checks + if not os.access(path, os.R_OK): + raise OSError(f"The file '{path}' could not be read") + if isinstance(h5file, tb.file.File) and h5file.mode == "r": + raise OSError(f"The file '{h5file.filename}' is opened read-only") + + # guess filenode's name if necessary + if name is None: + if isinstance(where, tb.group.Group): + name = os.path.split(filename)[1] + if isinstance(where, str): + if where.endswith("/"): + name = os.path.split(filename)[1] + else: + nodepath = where.split("/") + where = "/" + "/".join(nodepath[:-1]) + name = nodepath[-1] + + # sanitize name if necessary + if not tb.path._python_id_re.match(name): + name = re.sub('(?![a-zA-Z0-9_]).', "_", + re.sub('^(?![a-zA-Z_]).', "_", name)) + + new_h5file = not isinstance(h5file, tb.file.File) + f = tb.File(h5file, "a") if new_h5file else h5file + + # check for already existing filenode + try: + f.get_node(where=where, name=name) + if not overwrite: + if new_h5file: + f.close() + raise OSError( + f"Specified node already exists in file '{f.filename}'" + ) + except tb.NoSuchNodeError: + pass + + # read data from disk + data = path.read_bytes() + + # remove existing filenode if present + try: + f.remove_node(where=where, name=name) + except tb.NoSuchNodeError: + pass + + # write file's contents to filenode + fnode = new_node(f, where=where, name=name, title=title, filters=filters) + fnode.write(data) + fnode.attrs._filename = path.name + fnode.close() + + # cleanup + if new_h5file: + f.close() + + +def read_from_filenode(h5file, filename, where, name=None, overwrite=False, + create_target=False): + r"""Read a filenode from a PyTables file and write its contents to a file. + + .. versionadded:: 3.2 + + Parameters + ---------- + h5file + The PyTables file to be read from; can be either a string + giving the file's location or a :class:`File` object. + + filename + Path of the file where the contents of the filenode shall be + written to. 
If *filename* points to a directory or ends with + ``/`` (``\`` on Windows), the filename will be set to the + *_filename* (if present; otherwise the *name*) attribute of the + read filenode. + + where, name + Location of the filenode where the data shall be read from. If + no node *name* can be found at *where*, the first node at + *where* whose *_filename* attribute matches *name* will be read. + + overwrite + Whether or not a possibly existing file of the specified + *filename* shall be overwritten. + + create_target + Whether or not the folder hierarchy needed to accomodate the + given target ``filename`` will be created. + + """ + path = Path(filename).resolve() + + new_h5file = not isinstance(h5file, tb.file.File) + f = tb.File(h5file, "r") if new_h5file else h5file + try: + fnode = open_node(f.get_node(where=where, name=name)) + except tb.NoSuchNodeError: + fnode = None + for n in f.walk_nodes(where=where, classname="EArray"): + if n.attrs._filename == name: + fnode = open_node(n) + break + if fnode is None: + f.close() + raise tb.NoSuchNodeError("A filenode '%s' cannot be found at " + "'%s'" % (name, where)) + + # guess output filename if necessary + # TODO: pathlib.Path strips trailing slash automatically :-( + if path.is_dir() or filename.endswith(os.path.sep): + try: + path = path / fnode.node.attrs._filename + except Exception: + path = path / fnode.node.name + + if os.access(path, os.R_OK) and not overwrite: + if new_h5file: + f.close() + raise OSError(f"The file '{path}' already exists") + + # create folder hierarchy if necessary + if create_target: + path.parent.mkdir(parents=True, exist_ok=True) + + if not os.access(path.parent, os.W_OK): + if new_h5file: + f.close() + raise OSError("The file '%s' cannot be written to" % filename) + + # read data from filenode + data = fnode.read() + fnode.close() + + # store data to file + path.write_bytes(data) + + # cleanup + del data + if new_h5file: + f.close() diff --git a/tables/nodes/tests/__init__.py b/tables/nodes/tests/__init__.py new file mode 100644 index 0000000..072ade1 --- /dev/null +++ b/tables/nodes/tests/__init__.py @@ -0,0 +1 @@ +"""Unit tests for special node behaviours.""" diff --git a/tables/nodes/tests/test_filenode.dat b/tables/nodes/tests/test_filenode.dat new file mode 100644 index 0000000..a25f84e --- /dev/null +++ b/tables/nodes/tests/test_filenode.dat @@ -0,0 +1,52 @@ +#define test_width 64 +#define test_height 64 +static char test_bits[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xC0, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xC4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x3F, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, 0x1E, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0xB8, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xC7, 0xF8, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE1, 0xF1, 0xFE, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0x1F, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x1F, 0xC7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xC7, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xE3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xE3, 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xF8, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x3D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x03, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0x01, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xEF, 0x70, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xE1, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x84, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x18, 0x7C, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0x7C, + 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x31, 0x3C, 0xFC, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x63, 0x0E, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0x87, + 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xCF, 0xC7, 0x9E, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x61, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x31, + 0xC3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0xE1, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xBC, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, + 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, 0xBC, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0F, 0x07, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, + 0x03, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, 0xF1, 0xF8, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF1, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xF9, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xF9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF1, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x63, 0x18, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07, 0x06, 0x9E, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x42, 0x84, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xF1, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xE1, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8, 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x79, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x71, 0xFC, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03, 0x1E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x07, 0x03, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8F, 0x61, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDF, 0xC1, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xC1, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0xF9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x71, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x63, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, }; + + + + + + diff --git a/tables/nodes/tests/test_filenode.py b/tables/nodes/tests/test_filenode.py new file mode 100644 index 0000000..3cb3c01 --- /dev/null +++ b/tables/nodes/tests/test_filenode.py @@ -0,0 +1,1040 @@ +"""Unit test for the filenode module.""" + +import os +import shutil +import tempfile +import warnings +from pathlib import Path + +from ... 
import open_file, file, NoSuchNodeError +from ...nodes import filenode +from ...tests.common import ( + unittest, TempFileMixin, parse_argv, print_versions, + PyTablesTestCase as TestCase) + + +def test_file(name): + from pkg_resources import resource_filename + return resource_filename('tables.nodes.tests', name) + + +class NewFileTestCase(TempFileMixin, TestCase): + """Tests creating a new file node with the new_node() function.""" + + def test00_NewFile(self): + """Creation of a brand new file node.""" + + try: + fnode = filenode.new_node(self.h5file, where='/', name='test') + node = self.h5file.get_node('/test') + except LookupError: + self.fail("filenode.new_node() failed to create a new node.") + else: + self.assertEqual( + fnode.node, node, + "filenode.new_node() created a node in the wrong place.") + + def test01_NewFileTooFewArgs(self): + """Creation of a new file node without arguments for node creation.""" + + self.assertRaises(TypeError, filenode.new_node, self.h5file) + + def test02_NewFileWithExpectedSize(self): + """Creation of a new file node with 'expectedsize' argument.""" + + try: + filenode.new_node( + self.h5file, where='/', name='test', expectedsize=100_000) + except TypeError: + self.fail("filenode.new_node() failed to accept 'expectedsize'" + " argument.") + + def test03_NewFileWithExpectedRows(self): + """Creation of a new file node with illegal 'expectedrows' argument.""" + + self.assertRaises( + TypeError, filenode.new_node, + self.h5file, where='/', name='test', expectedrows=100_000) + + +class ClosedFileTestCase(TempFileMixin, TestCase): + """Tests calling several methods on a closed file.""" + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the writable, temporary HDF5 file with a '/test' node + * 'fnode', the closed file node in '/test' + + """ + + super().setUp() + self.fnode = filenode.new_node(self.h5file, where='/', name='test') + self.fnode.close() + + def tearDown(self): + """tearDown() -> None + + Closes 'h5file'; removes 'h5fname'. + + """ + + self.fnode = None + super().tearDown() + + # All these tests mey seem odd, but Python (2.3) files + # do test whether the file is not closed regardless of their mode. 
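+    # Here that means: close() must stay callable on an already closed
+    # node (see test00_Close below), while the remaining operations are
+    # expected to raise ValueError.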
+ def test00_Close(self): + """Closing a closed file.""" + + try: + self.fnode.close() + except ValueError: + self.fail("Could not close an already closed file.") + + def test01_Flush(self): + """Flushing a closed file.""" + + self.assertRaises(ValueError, self.fnode.flush) + + def test02_Next(self): + """Getting the next line of a closed file.""" + + self.assertRaises(ValueError, next, self.fnode) + + def test03_Read(self): + """Reading a closed file.""" + + self.assertRaises(ValueError, self.fnode.read) + + def test04_Readline(self): + """Reading a line from a closed file.""" + + self.assertRaises(ValueError, self.fnode.readline) + + def test05_Readlines(self): + """Reading lines from a closed file.""" + + self.assertRaises(ValueError, self.fnode.readlines) + + def test06_Seek(self): + """Seeking a closed file.""" + + self.assertRaises(ValueError, self.fnode.seek, 0) + + def test07_Tell(self): + """Getting the pointer position in a closed file.""" + + self.assertRaises(ValueError, self.fnode.tell) + + def test08_Truncate(self): + """Truncating a closed file.""" + + self.assertRaises(ValueError, self.fnode.truncate) + + def test09_Write(self): + """Writing a closed file.""" + + self.assertRaises(ValueError, self.fnode.write, b'foo') + + def test10_Writelines(self): + """Writing lines to a closed file.""" + + self.assertRaises(ValueError, self.fnode.writelines, [b'foo\n']) + + +def copyFileToFile(srcfile, dstfile, blocksize=4096): + """copyFileToFile(srcfile, dstfile[, blocksize]) -> None + + Copies a readable opened file 'srcfile' to a writable opened file + 'destfile' in blocks of 'blocksize' bytes (4 KiB by default). + + """ + + data = srcfile.read(blocksize) + while len(data) > 0: + dstfile.write(data) + data = srcfile.read(blocksize) + + +class WriteFileTestCase(TempFileMixin, TestCase): + """Tests writing, seeking and truncating a new file node.""" + + datafname = 'test_filenode.dat' + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the writable, temporary HDF5 file with a '/test' node + * 'fnode', the writable file node in '/test' + + """ + + super().setUp() + self.fnode = filenode.new_node(self.h5file, where='/', name='test') + self.datafname = test_file(self.datafname) + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode' and 'h5file'; removes 'h5fname'. 
+ + """ + + self.fnode.close() + self.fnode = None + super().tearDown() + + def test00_WriteFile(self): + """Writing a whole file node.""" + + datafile = open(self.datafname, 'rb') + try: + copyFileToFile(datafile, self.fnode) + finally: + datafile.close() + + def test01_SeekFile(self): + """Seeking and writing file node.""" + + self.fnode.write(b'0123') + self.fnode.seek(8) + self.fnode.write(b'4567') + self.fnode.seek(3) + data = self.fnode.read(6) + self.assertEqual( + data, b'3\0\0\0\0'b'4', + "Gap caused by forward seek was not properly filled.") + + self.fnode.seek(0) + self.fnode.write(b'test') + + self.fnode.seek(0) + data = self.fnode.read(4) + self.assertNotEqual( + data, b'test', "Data was overwritten instead of appended.") + + self.fnode.seek(-4, 2) + data = self.fnode.read(4) + self.assertEqual(data, b'test', "Written data was not appended.") + + self.fnode.seek(0, 2) + oldendoff = self.fnode.tell() + self.fnode.seek(-2, 2) + self.fnode.write(b'test') + newendoff = self.fnode.tell() + self.assertEqual( + newendoff, oldendoff - 2 + 4, + "Pointer was not correctly moved on append.") + + def test02_TruncateFile(self): + """Truncating a file node.""" + + self.fnode.write(b'test') + + self.fnode.seek(2) + self.assertRaises(IOError, self.fnode.truncate) + + self.fnode.seek(6) + self.fnode.truncate() + self.fnode.seek(0) + data = self.fnode.read() + self.assertEqual( + data, b'test\0\0', "File was not grown to the current offset.") + + self.fnode.truncate(8) + self.fnode.seek(0) + data = self.fnode.read() + self.assertEqual( + data, b'test\0\0\0\0', "File was not grown to an absolute size.") + + +class OpenFileTestCase(TempFileMixin, TestCase): + """Tests opening an existing file node for reading and writing.""" + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the writable, temporary HDF5 file with a '/test' node + + """ + + super().setUp() + fnode = filenode.new_node(self.h5file, where='/', name='test') + fnode.close() + + def test00_OpenFileRead(self): + """Opening an existing file node for reading.""" + + node = self.h5file.get_node('/test') + fnode = filenode.open_node(node) + self.assertEqual( + fnode.node, node, "filenode.open_node() opened the wrong node.") + self.assertEqual( + fnode.mode, 'r', + "File was opened with an invalid mode %s." % repr(fnode.mode)) + self.assertEqual( + fnode.tell(), 0, + "Pointer is not positioned at the beginning of the file.") + fnode.close() + + def test01_OpenFileReadAppend(self): + """Opening an existing file node for reading and appending.""" + + node = self.h5file.get_node('/test') + fnode = filenode.open_node(node, 'a+') + self.assertEqual( + fnode.node, node, "filenode.open_node() opened the wrong node.") + self.assertEqual( + fnode.mode, 'a+', + "File was opened with an invalid mode %s." % repr(fnode.mode)) + + self.assertEqual( + fnode.tell(), 0, + "Pointer is not positioned at the beginning of the file.") + fnode.close() + + def test02_OpenFileInvalidMode(self): + """Opening an existing file node with an invalid mode.""" + + self.assertRaises( + IOError, filenode.open_node, self.h5file.get_node('/test'), 'w') + + # This no longer works since type and type version attributes + # are now system attributes. ivb(2004-12-29) + # def test03_OpenFileNoAttrs(self): + # "Opening a node with no type attributes." 
+ # + # node = self.h5file.get_node('/test') + # self.h5file.del_node_attr('/test', '_type') + # # Another way to get the same result is changing the value. + # ##self.h5file.set_node_attr('/test', '_type', 'foobar') + # self.assertRaises(ValueError, filenode.open_node, node) + + +class ReadFileTestCase(TempFileMixin, TestCase): + """Tests reading from an existing file node.""" + + datafname = 'test_filenode.xbm' + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'datafile', the opened data file + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the writable, temporary HDF5 file with a '/test' node + * 'fnode', the readable file node in '/test', with data in it + + """ + + self.datafname = test_file(self.datafname) + self.datafile = open(self.datafname, 'rb') + + super().setUp() + + fnode = filenode.new_node(self.h5file, where='/', name='test') + copyFileToFile(self.datafile, fnode) + fnode.close() + + self.datafile.seek(0) + self.fnode = filenode.open_node(self.h5file.get_node('/test')) + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode', 'h5file' and 'datafile'; removes 'h5fname'. + + """ + + self.fnode.close() + self.fnode = None + + self.datafile.close() + self.datafile = None + + super().tearDown() + + def test00_CompareFile(self): + """Reading and comparing a whole file node.""" + + import hashlib + dfiledigest = hashlib.md5(self.datafile.read()).digest() + fnodedigest = hashlib.md5(self.fnode.read()).digest() + + self.assertEqual( + dfiledigest, fnodedigest, + "Data read from file node differs from that in the file on disk.") + + def test01_Write(self): + """Writing on a read-only file.""" + + self.assertRaises(IOError, self.fnode.write, 'no way') + + def test02_UseAsImageFile(self): + """Using a file node with Python Imaging Library.""" + + try: + from PIL import Image + + Image.open(self.fnode) + except ImportError: + # PIL not available, nothing to do. + pass + except OSError: + self.fail( + "PIL was not able to create an image from the file node.") + + def test_fileno(self): + self.assertIsNot(self.fnode.fileno(), None) + + +class ReadlineTestCase(TempFileMixin, TestCase): + """Base class for text line-reading test cases. + + It provides a set of tests independent of the line separator string. + Sub-classes must provide the 'line_separator' attribute. + + """ + + def setUp(self): + """This method sets the following instance attributes: + + * ``h5fname``: the name of the temporary HDF5 file. + * ``h5file``: the writable, temporary HDF5 file with a ``/test`` node. + * ``fnode``: the readable file node in ``/test``, with text in it. + + """ + + super().setUp() + + linesep = self.line_separator + + # Fill the node file with some text. + fnode = filenode.new_node(self.h5file, where='/', name='test') + # fnode.line_separator = linesep + fnode.write(linesep) + data = 'short line%sshort line%s%s' % ((linesep.decode('ascii'),) * 3) + data = data.encode('ascii') + fnode.write(data) + fnode.write(b'long line ' * 20 + linesep) + fnode.write(b'unterminated') + fnode.close() + + # Re-open it for reading. + self.fnode = filenode.open_node(self.h5file.get_node('/test')) + # self.fnode.line_separator = linesep + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode' and 'h5file'; removes 'h5fname'. 
+ + """ + + self.fnode.close() + self.fnode = None + super().tearDown() + + def test00_Readline(self): + """Reading individual lines.""" + + linesep = self.line_separator + + line = self.fnode.readline() + self.assertEqual(line, linesep) + + line = self.fnode.readline() # 'short line' + linesep + line = self.fnode.readline() + self.assertEqual(line, b'short line' + linesep) + line = self.fnode.readline() + self.assertEqual(line, linesep) + + line = self.fnode.readline() + self.assertEqual(line, b'long line ' * 20 + linesep) + + line = self.fnode.readline() + self.assertEqual(line, b'unterminated') + + line = self.fnode.readline() + self.assertEqual(line, b'') + + line = self.fnode.readline() + self.assertEqual(line, b'') + + def test01_ReadlineSeek(self): + """Reading individual lines and seeking back and forth.""" + + linesep = self.line_separator + lseplen = len(linesep) + + self.fnode.readline() # linesep + self.fnode.readline() # 'short line' + linesep + + self.fnode.seek(-(lseplen + 4), 1) + line = self.fnode.readline() + self.assertEqual(line, b'line' + linesep, + "Seeking back yielded different data.") + + self.fnode.seek(lseplen + 20, 1) # Into the long line. + line = self.fnode.readline() + self.assertEqual( + line[-(lseplen + 10):], b'long line ' + linesep, + "Seeking forth yielded unexpected data.") + + def test02_Iterate(self): + """Iterating over the lines.""" + + linesep = self.line_separator + + # Iterate to the end. + for line in self.fnode: + pass + + self.assertRaises(StopIteration, next, self.fnode) + + self.fnode.seek(0) + + line = next(self.fnode) + self.assertEqual(line, linesep) + + line = next(self.fnode) + self.assertEqual(line, b'short line' + linesep) + + def test03_Readlines(self): + """Reading a list of lines.""" + + linesep = self.line_separator + + lines = self.fnode.readlines() + self.assertEqual(lines, [ + linesep, b'short line' + linesep, b'short line' + linesep, + linesep, b'long line ' * 20 + linesep, b'unterminated']) + + def test04_ReadlineSize(self): + """Reading individual lines of limited size.""" + + linesep = self.line_separator + lseplen = len(linesep) + + line = self.fnode.readline() # linesep + + line = self.fnode.readline(lseplen + 20) + self.assertEqual(line, b'short line' + linesep) + + line = self.fnode.readline(5) + self.assertEqual(line, b'short') + + line = self.fnode.readline(lseplen + 20) + self.assertEqual(line, b' line' + linesep) + + line = self.fnode.readline(lseplen) + self.assertEqual(line, linesep) + + self.fnode.seek(-4, 2) + line = self.fnode.readline(4) + self.assertEqual(line, b'ated') + + self.fnode.seek(-4, 2) + line = self.fnode.readline(20) + self.assertEqual(line, b'ated') + + def test05_ReadlinesSize(self): + """Reading a list of lines with a limited size.""" + + linesep = self.line_separator + + data = '%sshort line%sshort' % ((linesep.decode('ascii'),) * 2) + data = data.encode('ascii') + lines = self.fnode.readlines(len(data)) + # self.assertEqual(lines, [linesep, b'short line' + linesep, b'short']) + # + # line = self.fnode.readline() + # self.assertEqual(line, b' line' + linesep) + + # NOTE: the test is relaxed because the *hint* parameter of + # io.BaseIO.readlines controls the amout of read data in a coarse way + self.assertEqual(len(lines), len(data.split(b'\n'))) + self.assertEqual(lines[:-1], [linesep, b'short line' + linesep]) + self.assertTrue(lines[-1].startswith(b'short')) + + +class MonoReadlineTestCase(ReadlineTestCase): + """Tests reading one-byte-separated text lines from an existing + file node.""" + 
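+    # One-byte separator; the shared ReadlineTestCase machinery above
+    # builds and checks the test text around this value.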
+ line_separator = b'\n' + + +# class MultiReadlineTestCase(ReadlineTestCase): +# "Tests reading multibyte-separated text lines from an existing file node." +# +# line_separator = b'
' + + +# class LineSeparatorTestCase(TempFileMixin, TestCase): +# "Tests text line separator manipulation in a file node." +# +# def setUp(self): +# """setUp() -> None +# +# This method sets the following instance attributes: +# * 'h5fname', the name of the temporary HDF5 file +# * 'h5file', the writable, temporary HDF5 file with a '/test' node +# * 'fnode', the writable file node in '/test' +# """ +# super().setUp() +# self.fnode = filenode.new_node(self.h5file, where='/', name='test') +# +# def tearDown(self): +# """tearDown() -> None +# +# Closes 'fnode' and 'h5file'; removes 'h5fname'. +# """ +# self.fnode.close() +# self.fnode = None +# super().tearDown() +# +# def test00_DefaultLineSeparator(self): +# "Default line separator." +# +# self.assertEqual( +# self.fnode.line_separator, os.linesep.encode('ascii'), +# "Default line separator does not match that in os.linesep.") +# +# def test01_SetLineSeparator(self): +# "Setting a valid line separator." +# +# try: +# self.fnode.line_separator = b'SEPARATOR' +# except ValueError: +# self.fail("Valid line separator was not accepted.") +# else: +# self.assertEqual( +# self.fnode.line_separator, b'SEPARATOR', +# "Line separator was not correctly set.") +# +# def test02_SetInvalidLineSeparator(self): +# "Setting an invalid line separator." +# +# self.assertRaises( +# ValueError, setattr, self.fnode, 'line_separator', b'') +# self.assertRaises( +# ValueError, setattr, self.fnode, 'line_separator', b'x' * 1024) +# self.assertRaises( +# TypeError, setattr, self.fnode, 'line_separator', 'x') + + +class AttrsTestCase(TempFileMixin, TestCase): + """Tests setting and getting file node attributes.""" + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the writable, temporary HDF5 file with a '/test' node + * 'fnode', the writable file node in '/test' + + """ + + super().setUp() + self.fnode = filenode.new_node(self.h5file, where='/', name='test') + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode' and 'h5file'; removes 'h5fname'. + + """ + + self.fnode.close() + self.fnode = None + super().tearDown() + + # This no longer works since type and type version attributes + # are now system attributes. ivb(2004-12-29) + # def test00_GetTypeAttr(self): + # "Getting the type attribute of a file node." + # + # self.assertEqual( + # getattr(self.fnode.attrs, '_type', None), filenode.NodeType, + # "File node has no '_type' attribute.") + def test00_MangleTypeAttrs(self): + """Mangling the type attributes on a file node.""" + + nodeType = getattr(self.fnode.attrs, 'NODE_TYPE', None) + self.assertEqual( + nodeType, filenode.NodeType, + "File node does not have a valid 'NODE_TYPE' attribute.") + + nodeTypeVersion = getattr(self.fnode.attrs, 'NODE_TYPE_VERSION', None) + self.assertTrue( + nodeTypeVersion in filenode.NodeTypeVersions, + "File node does not have a valid 'NODE_TYPE_VERSION' attribute.") + + # System attributes are now writable. ivb(2004-12-30) + # self.assertRaises( + # AttributeError, + # setattr, self.fnode.attrs, 'NODE_TYPE', 'foobar') + # self.assertRaises( + # AttributeError, + # setattr, self.fnode.attrs, 'NODE_TYPE_VERSION', 'foobar') + + # System attributes are now removables. F. Alted (2007-03-06) +# self.assertRaises( +# AttributeError, +# delattr, self.fnode.attrs, 'NODE_TYPE') +# self.assertRaises( +# AttributeError, +# delattr, self.fnode.attrs, 'NODE_TYPE_VERSION') + + # System attributes are now writable. 
ivb(2004-12-30) + # def test01_SetSystemAttr(self): + # "Setting a system attribute on a file node." + # + # self.assertRaises( + # AttributeError, setattr, self.fnode.attrs, 'CLASS', 'foobar') + def test02_SetGetDelUserAttr(self): + """Setting a user attribute on a file node.""" + + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), None, + "Inexistent attribute has a value that is not 'None'.") + + self.fnode.attrs.userAttr = 'foobar' + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), 'foobar', + "User attribute was not correctly set.") + + self.fnode.attrs.userAttr = 'bazquux' + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), 'bazquux', + "User attribute was not correctly changed.") + + del self.fnode.attrs.userAttr + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), None, + "User attribute was not deleted.") + # Another way is looking up the attribute in the attribute list. + # if 'userAttr' in self.fnode.attrs._f_list(): + # self.fail("User attribute was not deleted.") + + def test03_AttrsOnClosedFile(self): + """Accessing attributes on a closed file node.""" + + self.fnode.close() + self.assertRaises(AttributeError, getattr, self.fnode, 'attrs') + + +class ClosedH5FileTestCase(TempFileMixin, TestCase): + """Tests accessing a file node in a closed PyTables file.""" + + def setUp(self): + """setUp() -> None + + This method sets the following instance attributes: + * 'h5fname', the name of the temporary HDF5 file + * 'h5file', the closed HDF5 file with a '/test' node + * 'fnode', the writable file node in '/test' + + """ + + super().setUp() + self.fnode = filenode.new_node(self.h5file, where='/', name='test') + self.h5file.close() + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode'; removes 'h5fname'. + + """ + + # ivilata: We know that a UserWarning will be raised + # because the PyTables file has already been closed. + # However, we don't want it to pollute the test output. + warnings.filterwarnings('ignore', category=UserWarning) + try: + self.fnode.close() + except ValueError: + pass + finally: + warnings.filterwarnings('default', category=UserWarning) + + self.fnode = None + super().tearDown() + + def test00_Write(self): + """Writing to a file node in a closed PyTables file.""" + + self.assertRaises(ValueError, self.fnode.write, 'data') + + def test01_Attrs(self): + """Accessing the attributes of a file node in a closed + PyTables file.""" + + self.assertRaises(ValueError, getattr, self.fnode, 'attrs') + + +class OldVersionTestCase(TestCase): + """Base class for old version compatibility test cases. + + It provides some basic tests for file operations and attribute handling. + Sub-classes must provide the 'oldversion' attribute + and the 'oldh5fname' attribute. + + """ + + def setUp(self): + """This method sets the following instance attributes: + + * ``h5fname``: the name of the temporary HDF5 file. + * ``h5file``: the writable, temporary HDF5 file with a ``/test`` node. + * ``fnode``: the readable file node in ``/test``. 
+ + """ + + super().setUp() + self.h5fname = tempfile.mktemp(suffix='.h5') + + self.oldh5fname = test_file(self.oldh5fname) + oldh5f = open_file(self.oldh5fname) + oldh5f.copy_file(self.h5fname) + oldh5f.close() + + self.h5file = open_file( + self.h5fname, 'r+', + title="Test for file node old version compatibility") + self.fnode = filenode.open_node(self.h5file.root.test, 'a+') + + def tearDown(self): + """Closes ``fnode`` and ``h5file``; removes ``h5fname``.""" + + self.fnode.close() + self.fnode = None + self.h5file.close() + self.h5file = None + Path(self.h5fname).unlink() + super().tearDown() + + def test00_Read(self): + """Reading an old version file node.""" + + # self.fnode.line_separator = '\n' + + line = self.fnode.readline() + self.assertEqual(line, 'This is only\n') + + line = self.fnode.readline() + self.assertEqual(line, 'a test file\n') + + line = self.fnode.readline() + self.assertEqual(line, 'for FileNode version %d\n' % self.oldversion) + + line = self.fnode.readline() + self.assertEqual(line, '') + + self.fnode.seek(0) + line = self.fnode.readline() + self.assertEqual(line, 'This is only\n') + + def test01_Write(self): + """Writing an old version file node.""" + + # self.fnode.line_separator = '\n' + + self.fnode.write('foobar\n') + self.fnode.seek(-7, 2) + line = self.fnode.readline() + self.assertEqual(line, 'foobar\n') + + def test02_Attributes(self): + """Accessing attributes in an old version file node.""" + + self.fnode.attrs.userAttr = 'foobar' + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), 'foobar', + "User attribute was not correctly set.") + + self.fnode.attrs.userAttr = 'bazquux' + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), 'bazquux', + "User attribute was not correctly changed.") + + del self.fnode.attrs.userAttr + self.assertEqual( + getattr(self.fnode.attrs, 'userAttr', None), None, + "User attribute was not deleted.") + + +class Version1TestCase(OldVersionTestCase): + """Basic test for version 1 format compatibility.""" + + oldversion = 1 + oldh5fname = 'test_filenode_v1.h5' + + +class DirectReadWriteTestCase(TempFileMixin, TestCase): + + datafname = 'test_filenode.dat' + + def setUp(self): + """This method sets the following instance attributes: + + * ``h5fname``: the name of the temporary HDF5 file. + * ``h5file``, the writable, temporary HDF5 file with a '/test' node + * ``datafname``: the name of the data file to be stored in the + temporary HDF5 file. + * ``data``: the contents of the file ``datafname`` + * ``testfname``: the name of a temporary file to be written to. + + """ + + super().setUp() + self.datafname = test_file(self.datafname) + self.testfname = tempfile.mktemp() + self.testh5fname = tempfile.mktemp(suffix=".h5") + self.data = Path(self.datafname).read_bytes() + self.testdir = tempfile.mkdtemp() + + def tearDown(self): + """tearDown() -> None + + Closes 'fnode' and 'h5file'; removes 'h5fname'. + + """ + if os.access(self.testfname, os.R_OK): + Path(self.testfname).unlink() + if os.access(self.testh5fname, os.R_OK): + Path(self.testh5fname).unlink() + shutil.rmtree(self.testdir) + super().tearDown() + + def test01_WriteToPathlibPath(self): + testh5fname = Path(self.testh5fname) + datafname = Path(self.datafname) + filenode.save_to_filenode(testh5fname, datafname, "/test1") + + def test01_WriteToFilename(self): + # write contents of datafname to h5 testfile + filenode.save_to_filenode(self.testh5fname, self.datafname, "/test1") + # make sure writing to an existing node doesn't work ... 
+ self.assertRaises(IOError, filenode.save_to_filenode, self.testh5fname, + self.datafname, "/test1") + # ... except if overwrite is True + filenode.save_to_filenode(self.testh5fname, self.datafname, "/test1", + overwrite=True) + # write again, this time specifying a name + filenode.save_to_filenode(self.testh5fname, self.datafname, "/", + name="test2") + # read from test h5file + filenode.read_from_filenode(self.testh5fname, self.testfname, "/test1") + # and compare result to what it should be + self.assertEqual(Path(self.testfname).read_bytes(), self.data) + # make sure extracting to an existing file doesn't work ... + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + self.assertRaises(IOError, filenode.read_from_filenode, + self.testh5fname, self.testfname, "/test1") + # except overwrite is True. And try reading with a name + filenode.read_from_filenode(self.testh5fname, self.testfname, "/", + name="test2", overwrite=True) + # and compare to what it should be + self.assertEqual(Path(self.testfname).read_bytes(), self.data) + # cleanup + Path(self.testfname).unlink() + Path(self.testh5fname).unlink() + + def test02_WriteToHDF5File(self): + # write contents of datafname to h5 testfile + filenode.save_to_filenode(self.h5file, self.datafname, "/test1") + # make sure writing to an existing node doesn't work ... + self.assertRaises(IOError, filenode.save_to_filenode, self.h5file, + self.datafname, "/test1") + # ... except if overwrite is True + filenode.save_to_filenode(self.h5file, self.datafname, "/test1", + overwrite=True) + # read from test h5file + filenode.read_from_filenode(self.h5file, self.testfname, "/test1") + # and compare result to what it should be + self.assertEqual(Path(self.testfname).read_bytes(), self.data) + # make sure extracting to an existing file doesn't work ... 
+ self.assertRaises(IOError, filenode.read_from_filenode, self.h5file, + self.testfname, "/test1") + # make sure the original h5file is still alive and kicking + self.assertEqual(isinstance(self.h5file, file.File), True) + self.assertEqual(self.h5file.mode, "w") + + def test03_AutomaticNameGuessing(self): + # write using the filename as node name + filenode.save_to_filenode(self.testh5fname, self.datafname, "/") + # and read again + datafname = Path(self.datafname).name + filenode.read_from_filenode(self.testh5fname, self.testdir, "/", + name=datafname.replace(".", "_")) + # test if the output file really has the expected name + self.assertEqual(os.access(Path(self.testdir) / datafname, os.R_OK), + True) + # and compare result to what it should be + self.assertEqual((Path(self.testdir) / datafname).read_bytes(), + self.data) + + def test04_AutomaticNameGuessingWithFilenameAttribute(self): + # write using the filename as node name + filenode.save_to_filenode(self.testh5fname, self.datafname, "/") + # and read again + datafname = Path(self.datafname).name + filenode.read_from_filenode(self.testh5fname, self.testdir, "/", + name=datafname) + # test if the output file really has the expected name + self.assertEqual(os.access(Path(self.testdir) / datafname, os.R_OK), + True) + # and compare result to what it should be + self.assertEqual((Path(self.testdir) / datafname).read_bytes(), + self.data) + + def test05_ReadFromNonexistingNodeRaises(self): + # write using the filename as node name + filenode.save_to_filenode(self.testh5fname, self.datafname, "/") + # and read again + self.assertRaises(NoSuchNodeError, filenode.read_from_filenode, + self.testh5fname, self.testdir, "/", + name="THISNODEDOESNOTEXIST") + + +def suite(): + """suite() -> test suite + + Returns a test suite consisting of all the test cases in the module. 
+ + """ + + theSuite = unittest.TestSuite() + + theSuite.addTest(unittest.makeSuite(NewFileTestCase)) + theSuite.addTest(unittest.makeSuite(ClosedFileTestCase)) + theSuite.addTest(unittest.makeSuite(WriteFileTestCase)) + theSuite.addTest(unittest.makeSuite(OpenFileTestCase)) + theSuite.addTest(unittest.makeSuite(ReadFileTestCase)) + theSuite.addTest(unittest.makeSuite(MonoReadlineTestCase)) + # theSuite.addTest(unittest.makeSuite(MultiReadlineTestCase)) + # theSuite.addTest(unittest.makeSuite(LineSeparatorTestCase)) + theSuite.addTest(unittest.makeSuite(AttrsTestCase)) + theSuite.addTest(unittest.makeSuite(ClosedH5FileTestCase)) + theSuite.addTest(unittest.makeSuite(DirectReadWriteTestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + parse_argv(sys.argv) + print_versions() + unittest.main(defaultTest='suite') diff --git a/tables/nodes/tests/test_filenode.xbm b/tables/nodes/tests/test_filenode.xbm new file mode 100644 index 0000000..a25f84e --- /dev/null +++ b/tables/nodes/tests/test_filenode.xbm @@ -0,0 +1,52 @@ +#define test_width 64 +#define test_height 64 +static char test_bits[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xC0, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xC4, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x3F, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, 0x1E, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0xB8, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xC7, 0xF8, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE1, 0xF1, 0xFE, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF3, 0x1F, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x1F, 0xC7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xC7, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC7, 0xE3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xE3, 0xF0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xF8, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x3D, 0xEE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x03, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0x01, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xEF, 0x70, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xE1, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x84, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x18, 0x7C, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0x7C, + 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x31, 0x3C, 0xFC, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x63, 0x0E, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x87, 0x87, + 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xCF, 0xC7, 0x9E, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x61, 0xCF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x31, + 0xC3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x38, 0xE1, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xBC, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, + 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x1F, 0xBC, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0F, 0x07, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, + 0x03, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x9F, 0xF1, 0xF8, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF1, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xF9, 0xF1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xF9, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xF1, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x63, 0x18, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07, 0x06, 0x9E, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x42, 0x84, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xF1, 0xC0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF9, 0xE1, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF8, 0xF0, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, + 0xFF, 0x79, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x71, 0xFC, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03, 0x1E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x07, 0x03, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8F, 0x61, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDF, 0xC1, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xC1, 0xF9, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0xF9, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x19, 0xF8, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x71, 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x63, 0xFC, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xC3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, }; + + + + + + diff --git a/tables/nodes/tests/test_filenode_v1.h5 b/tables/nodes/tests/test_filenode_v1.h5 new file mode 100644 index 0000000..a370b3d Binary files /dev/null and b/tables/nodes/tests/test_filenode_v1.h5 differ diff --git a/tables/parameters.py b/tables/parameters.py new file mode 100644 index 0000000..9acc99f --- /dev/null +++ b/tables/parameters.py @@ -0,0 +1,444 @@ +"""Parameters for PyTables.""" + +import os as _os + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + +_KB = 1024 +"""The size of a Kilobyte in bytes""" + +_MB = 1024 * _KB +"""The size of a Megabyte in bytes""" + +# Tunable parameters +# ================== +# Be careful when touching these! + +# Parameters for different internal caches +# ---------------------------------------- + +BOUNDS_MAX_SIZE = 1 * _MB +"""The maximum size for bounds values cached during index lookups.""" + +BOUNDS_MAX_SLOTS = 4 * _KB +"""The maximum number of slots for the BOUNDS cache.""" + +ITERSEQ_MAX_ELEMENTS = 1 * _KB +"""The maximum number of iterator elements cached in data lookups.""" + +ITERSEQ_MAX_SIZE = 1 * _MB +"""The maximum space that will take ITERSEQ cache (in bytes).""" + +ITERSEQ_MAX_SLOTS = 128 +"""The maximum number of slots in ITERSEQ cache.""" + +LIMBOUNDS_MAX_SIZE = 256 * _KB +"""The maximum size for the query limits (for example, ``(lim1, lim2)`` +in conditions like ``lim1 <= col < lim2``) cached during index lookups +(in bytes).""" + +LIMBOUNDS_MAX_SLOTS = 128 +"""The maximum number of slots for LIMBOUNDS cache.""" + +TABLE_MAX_SIZE = 1 * _MB +"""The maximum size for table chunks cached during index queries.""" + +SORTED_MAX_SIZE = 1 * _MB +"""The maximum size for sorted values cached during index lookups.""" + +SORTEDLR_MAX_SIZE = 8 * _MB +"""The maximum size for chunks in last row cached in index lookups (in +bytes).""" + +SORTEDLR_MAX_SLOTS = 1 * _KB +"""The maximum number of chunks for SORTEDLR cache.""" + + +# Parameters for general cache behaviour +# -------------------------------------- +# +# The next parameters will not be effective if passed to the +# `open_file()` function (so, they can only be changed in a *global* +# way). You can change them in the file, but this is strongly +# discouraged unless you know well what you are doing. + +DISABLE_EVERY_CYCLES = 10 +"""The number of cycles in which a cache will be forced to be disabled +if the hit ratio is lower than the LOWEST_HIT_RATIO (see below). This +value should provide time enough to check whether the cache is being +efficient or not.""" + +ENABLE_EVERY_CYCLES = 50 +"""The number of cycles in which a cache will be forced to be +(re-)enabled, irregardingly of the hit ratio. 
This will provide a chance +for checking if we are in a better scenario for doing caching again.""" + +LOWEST_HIT_RATIO = 0.6 +"""The minimum acceptable hit ratio for a cache to avoid disabling (and +freeing) it.""" + + +# Tunable parameters +# ================== +# Be careful when touching these! + +# Recommended maximum values +# -------------------------- + +# Following are the recommended values for several limits. However, +# these limits are somewhat arbitrary and can be increased if you have +# enough resources. + +MAX_COLUMNS = 512 +"""Maximum number of columns in :class:`tables.Table` objects before a +:exc:`tables.PerformanceWarning` is issued. This limit is somewhat +arbitrary and can be increased. +""" + +MAX_NODE_ATTRS = 4 * _KB +"""Maximum allowed number of attributes in a node.""" + +MAX_GROUP_WIDTH = 16 * _KB +"""Maximum allowed number of children hanging from a group.""" + +MAX_TREE_DEPTH = 2 * _KB +"""Maximum depth in object tree allowed.""" + +MAX_UNDO_PATH_LENGTH = 10 * _KB +"""Maximum length of paths allowed in undo/redo operations.""" + + +# Cache limits +# ------------ + +COND_CACHE_SLOTS = 128 +"""Maximum number of conditions for table queries to be kept in memory.""" + +CHUNK_CACHE_NELMTS = 521 +"""Number of elements for HDF5 chunk cache.""" + +CHUNK_CACHE_PREEMPT = 0.0 +"""Chunk preemption policy. This value should be between 0 and 1 +inclusive and indicates how much chunks that have been fully read are +favored for preemption. A value of zero means fully read chunks are +treated no differently than other chunks (the preemption is strictly +LRU) while a value of one means fully read chunks are always preempted +before other chunks.""" + +CHUNK_CACHE_SIZE = 2 * _MB +"""Size (in bytes) for HDF5 chunk cache.""" + +# Size for new metadata cache system +METADATA_CACHE_SIZE = 1 * _MB # 1 MB is the default for HDF5 +"""Size (in bytes) of the HDF5 metadata cache.""" + + +# NODE_CACHE_SLOTS tells the number of nodes that fits in the cache. +# +# There are several forces driving the election of this number: +# 1.- As more nodes, better chances to re-use nodes +# --> better performance +# 2.- As more nodes, the re-ordering of the LRU cache takes more time +# --> less performance +# 3.- As more nodes, the memory needs for PyTables grows, specially for table +# writings (that could take double of memory than table reads!). +# +# The default value here is quite conservative. If you have a system +# with tons of memory, and if you are touching regularly a very large +# number of leaves, try increasing this value and see if it fits better +# for you. Please report back your feedback. +NODE_CACHE_SLOTS = 64 +"""Maximum number of nodes to be kept in the metadata cache. + +It is the number of nodes to be kept in the metadata cache. Least recently +used nodes are unloaded from memory when this number of loaded nodes is +reached. To load a node again, simply access it as usual. +Nodes referenced by user variables and, in general, all nodes that are still +open are registered in the node manager and can be quickly accessed even +if they are not in the cache. + +Negative value means that all the touched nodes will be kept in an +internal dictionary. This is the faster way to load/retrieve nodes. +However, and in order to avoid a large memory comsumption, the user will +be warned when the number of loaded nodes will reach the +``-NODE_CACHE_SLOTS`` value. + +Finally, a value of zero means that any cache mechanism is disabled. 
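+
+For instance (the file name and value below are only illustrative), a
+different number of slots can be requested for a particular file by passing
+this parameter as a keyword argument to :func:`tables.open_file`::
+
+    h5file = tables.open_file('data.h5', 'r', NODE_CACHE_SLOTS=256)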
+""" + + +# Parameters for the I/O buffer in `Leaf` objects +# ----------------------------------------------- + +IO_BUFFER_SIZE = 1 * _MB +"""The PyTables internal buffer size for I/O purposes. Should not +exceed the amount of highest level cache size in your CPU.""" + +BUFFER_TIMES = 100 +"""The maximum buffersize/rowsize ratio before issuing a +:exc:`tables.PerformanceWarning`.""" + + +# Miscellaneous +# ------------- + +EXPECTED_ROWS_EARRAY = 1000 +"""Default expected number of rows for :class:`EArray` objects.""" + +EXPECTED_ROWS_VLARRAY = 1000 +"""Default expected number of rows for :class:`VLArray` objects. + +.. versionadded:: 3.0 + +""" + +EXPECTED_ROWS_TABLE = 10_000 +"""Default expected number of rows for :class:`Table` objects.""" + +PYTABLES_SYS_ATTRS = True +"""Set this to ``False`` if you don't want to create PyTables system +attributes in datasets. Also, if set to ``False`` the possible existing +system attributes are not considered for guessing the class of the node +during its loading from disk (this work is delegated to the PyTables' +class discoverer function for general HDF5 files).""" + +MAX_NUMEXPR_THREADS = _os.environ.get("NUMEXPR_MAX_THREADS", 4) +"""The maximum number of threads that PyTables should use internally in +Numexpr. If `None`, it is automatically set to the number of cores in +your machine. In general, it is a good idea to set this to the number of +cores in your machine or, when your machine has many of them (e.g. > 8), +perhaps stay at 8 at maximum. In general, 4 threads is a good tradeoff.""" + +MAX_BLOSC_THREADS = 1 # 1 is safe for concurrency +"""The maximum number of threads that PyTables should use internally in +Blosc. If `None`, it is automatically set to the number of cores in +your machine. For applications that use several PyTables instances +concurrently and so as to avoid locking problems, the recommended value +is 1. In other cases a value of 2 or 4 could make sense. + +""" + +USER_BLOCK_SIZE = 0 +"""Sets the user block size of a file. + +The default user block size is 0; it may be set to any power of 2 equal +to 512 or greater (512, 1024, 2048, etc.). + +.. versionadded:: 3.0 + +""" + +ALLOW_PADDING = True +"""Allow padding in compound data types. + +Starting on version 3.5 padding is honored during copies, or when tables +are created from NumPy structured arrays with padding (e.g. `align=True`). +If you actually want to get rid of any possible padding in new +datasets/attributes (i.e. the previous behaviour), set this to `False`. + +.. versionadded:: 3.5 + +""" + + +# HDF5 driver management +# ---------------------- +DRIVER = None +"""The HDF5 driver that should be used for reading/writing to the file. + +Following drivers are supported: + + * H5FD_SEC2: this driver uses POSIX file-system functions like read + and write to perform I/O to a single, permanent file on local + disk with no system buffering. + This driver is POSIX-compliant and is the default file driver for + all systems. + + * H5FD_DIRECT: this is the H5FD_SEC2 driver except data is written + to or read from the file synchronously without being cached by + the system. + + * H5FD_WINDOWS: this driver was modified in HDF5-1.8.8 to be a + wrapper of the POSIX driver, H5FD_SEC2. This change should not + affect user applications. + + * H5FD_STDIO: this driver uses functions from the standard C + stdio.h to perform I/O to a single, permanent file on local disk + with additional system buffering. 
+ + * H5FD_CORE: with this driver, an application can work with a file + in memory for faster reads and writes. File contents are kept in + memory until the file is closed. At closing, the memory version + of the file can be written back to disk or abandoned. + + * H5FD_SPLIT: this file driver splits a file into two parts. + One part stores metadata, and the other part stores raw data. + This splitting a file into two parts is a limited case of the + Multi driver. + +The following drivers are not currently supported: + + * H5FD_LOG: this is the H5FD_SEC2 driver with logging capabilities. + + * H5FD_FAMILY: with this driver, the HDF5 file’s address space is + partitioned into pieces and sent to separate storage files using + an underlying driver of the user’s choice. + This driver is for systems that do not support files larger than + 2 gigabytes. + + * H5FD_MULTI: with this driver, data can be stored in multiple + files according to the type of the data. I/O might work better if + data is stored in separate files based on the type of data. + The Split driver is a special case of this driver. + + * H5FD_MPIO: this is the standard HDF5 file driver for parallel + file systems. This driver uses the MPI standard for both + communication and file I/O. + + * H5FD_MPIPOSIX: this parallel file system driver uses MPI for + communication and POSIX file-system calls for file I/O. + + * H5FD_STREAM: this driver is no longer available. + +.. seealso:: the `Drivers section`_ of the `HDF5 User's Guide`_ for + more information. + +.. note:: + + not all supported drivers are always available. For example the + H5FD_WINDOWS driver is not available on non Windows platforms. + + If the user try to use a driver that is not available on the target + platform a :exc:`RuntimeError` is raised. + +.. versionadded:: 3.0 + +.. _`Drivers section`: + http://www.hdfgroup.org/HDF5/doc/UG/08_TheFile.html#Drivers +.. _`HDF5 User's Guide`: http://www.hdfgroup.org/HDF5/doc/UG/index.html + +""" + +DRIVER_DIRECT_ALIGNMENT = 0 +"""Specifies the required alignment boundary in memory. + +A value of 0 (zero) means to use HDF5 Library’s default value. + +.. versionadded:: 3.0 + +""" + +DRIVER_DIRECT_BLOCK_SIZE = 0 +"""Specifies the file system block size. + +A value of 0 (zero) means to use HDF5 Library’s default value of 4KB. + +.. versionadded:: 3.0 + +""" + +DRIVER_DIRECT_CBUF_SIZE = 0 +"""Specifies the copy buffer size. + +A value of 0 (zero) means to use HDF5 Library’s default value. + +.. versionadded:: 3.0 + +""" + +# DRIVER_LOG_FLAGS = 0x0001ffff +# """Flags specifying the types of logging activity. +# +# .. versionadded:: 3.0 +# +# .. seeealso:: +# http://www.hdfgroup.org/HDF5/doc/RM/RM_H5P.html#Property-SetFaplLog +# +# """ +# +# DRIVER_LOG_BUF_SIZE = 4 * _KB +# """The size of the logging buffers, in bytes. +# +# One buffer of size DRIVER_LOG_BUF_SIZE will be created for each of +# H5FD_LOG_FILE_READ, H5FD_LOG_FILE_WRITE and H5FD_LOG_FLAVOR when those +# flags are set; these buffers will not grow as the file increases in +# size. +# +# .. versionadded:: 3.0 +# +# """ + +DRIVER_CORE_INCREMENT = 64 * _KB +"""Core driver memory increment. + +Specifies the increment by which allocated memory is to be increased +each time more memory is required. + +.. versionadded:: 3.0 + +""" + +DRIVER_CORE_BACKING_STORE = 1 +"""Enables backing store for the core driver. 
+ +With the H5FD_CORE driver, if the DRIVER_CORE_BACKING_STORE is set +to 1 (True), the file contents are flushed to a file with the same name +as this core file when the file is closed or access to the file is +terminated in memory. + +The application is allowed to open an existing file with H5FD_CORE +driver. In that case, if the DRIVER_CORE_BACKING_STORE is set to 1 and +the flags for :func:`tables.open_file` is set to H5F_ACC_RDWR, any change +to the file contents are saved to the file when the file is closed. +If backing_store is set to 0 and the flags for :func:`tables.open_file` +is set to H5F_ACC_RDWR, any change to the file contents will be lost +when the file is closed. If the flags for :func:`tables.open_file` is +set to H5F_ACC_RDONLY, no change to the file is allowed either in +memory or on file. + +.. versionadded:: 3.0 + +""" + +DRIVER_CORE_IMAGE = None +"""String containing an HDF5 file image. + +If this option is passed to the :func:`tables.open_file` function then the +returned file object is set up using the specified image. + +A file image can be retrieved from an existing (and opened) file object +using the :meth:`tables.File.get_file_image` method. + +.. note:: requires HDF5 >= 1.8.9. + +.. versionadded:: 3.0 + +""" + +DRIVER_SPLIT_META_EXT = '-m.h5' +"""The extension for the metadata file used by the H5FD_SPLIT driver. + +If this option is passed to the :func:`tables.openFile` function along +with driver='H5FD_SPLIT', the extension is appended to the name passed +as the first parameter to form the name of the metadata file. If the +string '%s' is used in the extension, the metadata file name is formed +by replacing '%s' with the name passed as the first parameter instead. + +.. versionadded:: 3.1 + +""" + +DRIVER_SPLIT_RAW_EXT = '-r.h5' +"""The extension for the raw data file used by the H5FD_SPLIT driver. + +If this option is passed to the :func:`tables.openFile` function along +with driver='H5FD_SPLIT', the extension is appended to the name passed +as the first parameter to form the name of the raw data file. If the +string '%s' is used in the extension, the raw data file name is formed +by replacing '%s' with the name passed as the first parameter instead. + +.. versionadded:: 3.1 + +""" diff --git a/tables/path.py b/tables/path.py new file mode 100644 index 0000000..b3ea051 --- /dev/null +++ b/tables/path.py @@ -0,0 +1,221 @@ +"""Functionality related with node paths in a PyTables file. + +Variables +========= + +`__docformat`__ + The format of documentation strings in this module. + +""" + +import re +import warnings +import keyword + +from .exceptions import NaturalNameWarning + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +_python_id_re = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') +"""Python identifier regular expression.""" + +_reserved_id_re = re.compile('^_[cfgv]_') +"""PyTables reserved identifier regular expression. + +- c: class variables +- f: class public methods +- g: class private methods +- v: instance variables +""" + +_hidden_name_re = re.compile('^_[pi]_') +"""Nodes with a name *matching* this expression are considered hidden. + +For instance, ``name`` whould be visible while ``_i_name`` would not. +""" + +_hidden_path_re = re.compile('/_[pi]_') +"""Nodes with a path *containing* this expression are considered hidden. + +For instance, a node with a pathname like ``/a/b/c`` would be visible +while nodes with pathnames like ``/a/c/_i_x`` or ``/a/_p_x/y`` would +not. 
+""" + +_warnInfo = ( + "you will not be able to use natural naming to access this object; " + "using ``getattr()`` will still work, though") +"""Warning printed when a name will not be reachable through natural naming""" + + +def check_attribute_name(name): + """Check the validity of the `name` of an attribute in AttributeSet. + + If the name is not valid, a ``ValueError`` is raised. If it is + valid but it can not be used with natural naming, a + `NaturalNameWarning` is issued. + + >>> warnings.simplefilter("ignore") + >>> check_attribute_name('a') + >>> check_attribute_name('a_b') + >>> check_attribute_name('a:b') # NaturalNameWarning + >>> check_attribute_name('/a/b') # NaturalNameWarning + >>> check_attribute_name('/') # NaturalNameWarning + >>> check_attribute_name('.') # NaturalNameWarning + >>> check_attribute_name('__members__') + Traceback (most recent call last): + ... + ValueError: ``__members__`` is not allowed as an object name + >>> check_attribute_name(1) + Traceback (most recent call last): + ... + TypeError: object name is not a string: 1 + >>> check_attribute_name('') + Traceback (most recent call last): + ... + ValueError: the empty string is not allowed as an object name + """ + if not isinstance(name, str): # Python >= 2.3 + raise TypeError(f"object name is not a string: {name!r}") + + if name == '': + raise ValueError("the empty string is not allowed as an object name") + + # Check whether `name` is a valid Python identifier. + if not _python_id_re.match(name): + warnings.warn("object name is not a valid Python identifier: %r; " + "it does not match the pattern ``%s``; %s" + % (name, _python_id_re.pattern, _warnInfo), + NaturalNameWarning, stacklevel=2) + return + + # However, Python identifiers and keywords have the same form. + if keyword.iskeyword(name): + warnings.warn("object name is a Python keyword: %r; %s" + % (name, _warnInfo), NaturalNameWarning, stacklevel=2) + return + + # Still, names starting with reserved prefixes are not allowed. + if _reserved_id_re.match(name): + raise ValueError("object name starts with a reserved prefix: %r; " + "it matches the pattern ``%s``" + % (name, _reserved_id_re.pattern)) + + # ``__members__`` is the only exception to that rule. + if name == '__members__': + raise ValueError("``__members__`` is not allowed as an object name") + + +def check_name_validity(name): + """Check the validity of the `name` of a Node object, which more limited + than attribute names. + + If the name is not valid, a ``ValueError`` is raised. If it is + valid but it can not be used with natural naming, a + `NaturalNameWarning` is issued. + + >>> warnings.simplefilter("ignore") + >>> check_name_validity('a') + >>> check_name_validity('a_b') + >>> check_name_validity('a:b') # NaturalNameWarning + >>> check_name_validity('/a/b') + Traceback (most recent call last): + ... + ValueError: the ``/`` character is not allowed in object names: '/a/b' + >>> check_name_validity('.') + Traceback (most recent call last): + ... + ValueError: ``.`` is not allowed as an object name + >>> check_name_validity('') + Traceback (most recent call last): + ... + ValueError: the empty string is not allowed as an object name + + """ + check_attribute_name(name) + + # Check whether `name` is a valid HDF5 name. 
+ # http://hdfgroup.org/HDF5/doc/UG/03_Model.html#Structure + if name == '.': + raise ValueError("``.`` is not allowed as an object name") + elif '/' in name: + raise ValueError("the ``/`` character is not allowed " + "in object names: %r" % name) + + +def join_path(parentpath, name): + """Join a *canonical* `parentpath` with a *non-empty* `name`. + + .. versionchanged:: 3.0 + The *parentPath* parameter has been renamed into *parentpath*. + + >>> join_path('/', 'foo') + '/foo' + >>> join_path('/foo', 'bar') + '/foo/bar' + >>> join_path('/foo', '/foo2/bar') + '/foo/foo2/bar' + >>> join_path('/foo', '/') + '/foo' + + """ + + if name.startswith('./'): # Support relative paths (mainly for links) + name = name[2:] + if parentpath == '/' and name.startswith('/'): + pstr = '%s' % name + elif parentpath == '/' or name.startswith('/'): + pstr = f'{parentpath}{name}' + else: + pstr = f'{parentpath}/{name}' + if pstr.endswith('/'): + pstr = pstr[:-1] + return pstr + + +def split_path(path): + """Split a *canonical* `path` into a parent path and a node name. + + The result is returned as a tuple. The parent path does not + include a trailing slash. + + >>> split_path('/') + ('/', '') + >>> split_path('/foo/bar') + ('/foo', 'bar') + + """ + + lastslash = path.rfind('/') + ppath = path[:lastslash] + name = path[lastslash + 1:] + + if ppath == '': + ppath = '/' + + return (ppath, name) + + +def isvisiblename(name): + """Does this `name` make the named node a visible one?""" + + return _hidden_name_re.match(name) is None + + +def isvisiblepath(path): + """Does this `path` make the named node a visible one?""" + + return _hidden_path_re.search(path) is None + + +def _test(): + """Run ``doctest`` on this module.""" + + import doctest + doctest.testmod() + + +if __name__ == '__main__': + _test() diff --git a/tables/registry.py b/tables/registry.py new file mode 100644 index 0000000..ea139c3 --- /dev/null +++ b/tables/registry.py @@ -0,0 +1,77 @@ +"""Miscellaneous mappings used to avoid circular imports. + +Variables: + +`class_name_dict` + Node class name to class object mapping. +`class_id_dict` + Class identifier to class object mapping. + +Misc variables: + +`__docformat__` + The format of documentation strings in this module. + +""" + + +# Important: no modules from PyTables should be imported here +# (but standard modules are OK), since the main reason for this module +# is avoiding circular imports! + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + +class_name_dict = {} +"""Node class name to class object mapping. + +This dictionary maps class names (e.g. ``'Group'``) to actual class +objects (e.g. `Group`). Classes are registered here when they are +defined, and they are not expected to be unregistered (by now), but they +can be replaced when the module that defines them is reloaded. + +.. versionchanged:: 3.0 + The *classNameDict* dictionary has been renamed into *class_name_dict*. + +""" + +class_id_dict = {} +"""Class identifier to class object mapping. + +This dictionary maps class identifiers (e.g. ``'GROUP'``) to actual +class objects (e.g. `Group`). Classes defining a new ``_c_classid`` +attribute are registered here when they are defined, and they are not +expected to be unregistered (by now), but they can be replaced when the +module that defines them is reloaded. + +.. versionchanged:: 3.0 + The *classIdDict* dictionary has been renamed into *class_id_dict*. 
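+
+As a quick sketch of how this mapping can be used (assuming the
+standard node classes have already been imported, and hence
+registered)::
+
+    from tables import Group
+
+    assert class_id_dict['GROUP'] is Group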
+ +""" + +# Deprecated API +classNameDict = class_name_dict +classIdDict = class_id_dict + + +def get_class_by_name(classname): + """Get the node class matching the `classname`. + + If the name is not registered, a ``TypeError`` is raised. The empty + string and ``None`` are also accepted, and mean the ``Node`` class. + + .. versionadded:: 3.0 + + """ + + # The empty string is accepted for compatibility + # with old default arguments. + if classname is None or classname == '': + classname = 'Node' + + # Get the class object corresponding to `classname`. + if classname not in class_name_dict: + raise TypeError("there is no registered node class named ``%s``" + % (classname,)) + + return class_name_dict[classname] diff --git a/tables/req_versions.py b/tables/req_versions.py new file mode 100644 index 0000000..f5f11f4 --- /dev/null +++ b/tables/req_versions.py @@ -0,0 +1,15 @@ +"""Required versions for PyTables dependencies.""" + +from packaging.version import Version + +# ********************************************************************** +# Keep these in sync with setup.cfg and user's guide +# ********************************************************************** + +# Minimum recommended versions for mandatory packages +min_numpy_version = Version('1.9.3') +min_numexpr_version = Version('2.6.2') +min_hdf5_version = Version('1.8.4') +min_blosc_version = Version("1.4.1") +min_blosc_bitshuffle_version = Version("1.8.0") +"""The minumum Blosc version where BitShuffle can be used safely.""" diff --git a/tables/scripts/__init__.py b/tables/scripts/__init__.py new file mode 100644 index 0000000..12fc730 --- /dev/null +++ b/tables/scripts/__init__.py @@ -0,0 +1,6 @@ +"""Utility scripts for PyTables. + +This package contains some modules which provide a ``main()`` function +(with no arguments), so that they can be used as scripts. + +""" diff --git a/tables/scripts/pt2to3.py b/tables/scripts/pt2to3.py new file mode 100644 index 0000000..0c3949c --- /dev/null +++ b/tables/scripts/pt2to3.py @@ -0,0 +1,513 @@ +"""This utility helps you migrate from PyTables 2.x APIs to 3.x APIs, which +are PEP 8 compliant. 
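+
+The converted source is written to standard output, so typical usage is
+to redirect it to a new file (``-i`` rewrites the input file in place,
+and ``-r`` translates in the opposite direction, from 3.x names back to
+2.x names)::
+
+    $ pt2to3 oldfile.py > newfile.py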
+ +""" +import re +import sys +import argparse +from pathlib import Path + +old2newnames = dict([ + # from __init__.py + ('hdf5Version', 'hdf5_version'), # data + # from array.py + ('parentNode', 'parentnode'), # kwarg + ('getEnum', 'get_enum'), + ('_initLoop', '_init_loop'), + ('_fancySelection', '_fancy_selection'), + ('_checkShape', '_check_shape'), + ('_readSlice', '_read_slice'), + ('_readCoords', '_read_coords'), + ('_readSelection', '_read_selection'), + ('_writeSlice', '_write_slice'), + ('_writeCoords', '_write_coords'), + ('_writeSelection', '_write_selection'), + ('_g_copyWithStats', '_g_copy_with_stats'), + ('_c_classId', '_c_classid'), # attr + # from atom.py + ('_checkBase', '_checkbase'), + # from attributeset.py + ('newSet', 'newset'), # kwarg + ('copyClass', 'copyclass'), # kwarg + ('_g_updateNodeLocation', '_g_update_node_location'), + ('_g_logAdd', '_g_log_add'), + ('_g_delAndLog', '_g_del_and_log'), + ('_v__nodeFile', '_v__nodefile'), # attr (private) + ('_v__nodePath', '_v__nodepath'), # attr (private) + # from carray.py + # ('parentNode', 'parentnode'), # kwarg + # from description.py + ('_g_setNestedNamesDescr', '_g_set_nested_names_descr'), + ('_g_setPathNames', '_g_set_path_names'), + ('_v_colObjects', '_v_colobjects'), # attr + ('_v_nestedFormats', '_v_nested_formats'), # attr + ('_v_nestedNames', '_v_nested_names'), # attr + ('_v_nestedDescr', '_v_nested_descr'), # attr + ('getColsInOrder', 'get_cols_in_order'), + ('joinPaths', 'join_paths'), + ('metaIsDescription', 'MetaIsDescription'), + # from earray.py + # ('parentNode', 'parentnode'), # kwarg + ('_checkShapeAppend', '_check_shape_append'), + # from expression.py + ('_exprvarsCache', '_exprvars_cache'), # attr (private) + ('_requiredExprVars', '_required_expr_vars'), + ('setInputsRange', 'set_inputs_range'), + ('setOutput', 'set_output'), + ('setOutputRange', 'set_output_range'), + # from file.py + ('_opToCode', '_op_to_code'), # data (private) + ('_codeToOp', '_code_to_op'), # data (private) + ('_transVersion', '_trans_version'), # data (private) + ('_transGroupParent', '_trans_group_parent'), # data (private) + ('_transGroupName', '_trans_group_name'), # data (private) + ('_transGroupPath', '_trans_group_path'), # data (private) + ('_actionLogParent', '_action_log_parent'), # data (private) + ('_actionLogName', '_action_log_name'), # data (private) + ('_actionLogPath', '_action_log_path'), # data (private) + ('_transParent', '_trans_parent'), # data (private) + ('_transName', '_trans_name'), # data (private) + ('_transPath', '_trans_path'), # data (private) + ('_shadowParent', '_shadow_parent'), # data (private) + ('_shadowName', '_shadow_name'), # data (private) + ('_shadowPath', '_shadow_path'), # data (private) + ('copyFile', 'copy_file'), + ('openFile', 'open_file'), + ('_getValueFromContainer', '_get_value_from_container'), + ('__getRootGroup', '__get_root_group'), + ('rootUEP', 'root_uep'), # attr + ('_getOrCreatePath', '_get_or_create_path'), + ('_createPath', '_create_path'), + ('createGroup', 'create_group'), + ('createTable', 'create_table'), + ('createArray', 'create_array'), + ('createCArray', 'create_carray'), + ('createEArray', 'create_earray'), + ('createVLArray', 'create_vlarray'), + ('createHardLink', 'create_hard_link'), + ('createSoftLink', 'create_soft_link'), + ('createExternalLink', 'create_external_link'), + ('_getNode', '_get_node'), + ('getNode', 'get_node'), + ('isVisibleNode', 'is_visible_node'), + ('renameNode', 'rename_node'), + ('moveNode', 'move_node'), + ('copyNode', 
'copy_node'), + ('removeNode', 'remove_node'), + ('getNodeAttr', 'get_node_attr'), + ('setNodeAttr', 'set_node_attr'), + ('delNodeAttr', 'del_node_attr'), + ('copyNodeAttrs', 'copy_node_attrs'), + ('copyChildren', 'copy_children'), + ('listNodes', 'list_nodes'), + ('iterNodes', 'iter_nodes'), + ('walkNodes', 'walk_nodes'), + ('walkGroups', 'walk_groups'), + ('_checkOpen', '_check_open'), + ('_isWritable', '_iswritable'), + ('_checkWritable', '_check_writable'), + ('_checkGroup', '_check_group'), + ('isUndoEnabled', 'is_undo_enabled'), + ('_checkUndoEnabled', '_check_undo_enabled'), + ('_createTransactionGroup', '_create_transaction_group'), + ('_createTransaction', '_create_transaction'), + ('_createMark', '_create_mark'), + ('enableUndo', 'enable_undo'), + ('disableUndo', 'disable_undo'), + ('_getMarkID', '_get_mark_id'), + ('_getFinalAction', '_get_final_action'), + ('getCurrentMark', 'get_current_mark'), + ('_updateNodeLocations', '_update_node_locations'), + # from group.py + # ('parentNode', 'parentnode'), # kwarg + # ('ptFile', 'ptfile'), # kwarg + ('_getValueFromContainer', '_get_value_from_container'), + ('_g_postInitHook', '_g_post_init_hook'), + ('_g_getChildGroupClass', '_g_get_child_group_class'), + ('_g_getChildLeafClass', '_g_get_child_leaf_class'), + ('_g_addChildrenNames', '_g_add_children_names'), + ('_g_checkHasChild', '_g_check_has_child'), + ('_f_walkNodes', '_f_walknodes'), + ('_g_widthWarning', '_g_width_warning'), + ('_g_refNode', '_g_refnode'), + ('_g_unrefNode', '_g_unrefnode'), + ('_g_copyChildren', '_g_copy_children'), + ('_f_getChild', '_f_get_child'), + ('_f_listNodes', '_f_list_nodes'), + ('_f_iterNodes', '_f_iter_nodes'), + ('_f_walkGroups', '_f_walk_groups'), + ('_g_closeDescendents', '_g_close_descendents'), + ('_f_copyChildren', '_f_copy_children'), + ('_v_maxGroupWidth', '_v_max_group_width'), # attr + ('_v_objectID', '_v_objectid'), # attr + ('_g_loadChild', '_g_load_child'), + ('childName', 'childname'), # ??? 
+ ('_c_shadowNameRE', '_c_shadow_name_re'), # attr (private) + # from hdf5extension.p{yx,xd} + ('hdf5Extension', 'hdf5extension'), + ('_getFileId', '_get_file_id'), + ('_flushFile', '_flush_file'), + ('_closeFile', '_close_file'), + ('_g_listAttr', '_g_list_attr'), + ('_g_setAttr', '_g_setattr'), + ('_g_getAttr', '_g_getattr'), + ('_g_listGroup', '_g_list_group'), + ('_g_getGChildAttr', '_g_get_gchild_attr'), + ('_g_getLChildAttr', '_g_get_lchild_attr'), + ('_g_flushGroup', '_g_flush_group'), + ('_g_closeGroup', '_g_close_group'), + ('_g_moveNode', '_g_move_node'), + ('_convertTime64', '_convert_time64'), + ('_createArray', '_create_array'), + ('_createCArray', '_create_carray'), + ('_openArray', '_open_array'), + ('_readArray', '_read_array'), + ('_g_readSlice', '_g_read_slice'), + ('_g_readCoords', '_g_read_coords'), + ('_g_readSelection', '_g_read_selection'), + ('_g_writeSlice', '_g_write_slice'), + ('_g_writeCoords', '_g_write_coords'), + ('_g_writeSelection', '_g_write_selection'), + # from idxutils.py + ('calcChunksize', 'calc_chunksize'), + ('infinityF', 'infinityf'), # data + ('infinityMap', 'infinitymap'), # data + ('infType', 'inftype'), + ('StringNextAfter', 'string_next_after'), + ('IntTypeNextAfter', 'int_type_next_after'), + ('BoolTypeNextAfter', 'bool_type_next_after'), + # from index.py + # ('parentNode', 'parentnode'), # kwarg + ('defaultAutoIndex', 'default_auto_index'), # data + ('defaultIndexFilters', 'default_index_filters'), # data + ('_tableColumnPathnameOfIndex', '_table_column_pathname_of_index'), + ('_is_CSI', '_is_csi'), + ('is_CSI', 'is_csi'), # property + ('appendLastRow', 'append_last_row'), + ('read_sliceLR', 'read_slice_lr'), + ('readSorted', 'read_sorted'), + ('readIndices', 'read_indices'), + ('_processRange', '_process_range'), + ('searchLastRow', 'search_last_row'), + ('getLookupRange', 'get_lookup_range'), + ('_g_checkName', '_g_check_name'), + # from indexes.py + # ('parentNode', 'parentnode'), # kwarg + ('_searchBin', '_search_bin'), + # from indexesextension + ('indexesExtension', 'indexesextension'), + ('initRead', 'initread'), + ('readSlice', 'read_slice'), + ('_readIndexSlice', '_read_index_slice'), + ('_initSortedSlice', '_init_sorted_slice'), + ('_g_readSortedSlice', '_g_read_sorted_slice'), + ('_readSortedSlice', '_read_sorted_slice'), + ('getLRUbounds', 'get_lru_bounds'), + ('getLRUsorted', 'get_lru_sorted'), + ('_searchBinNA_b', '_search_bin_na_b'), + ('_searchBinNA_ub', '_search_bin_na_ub'), + ('_searchBinNA_s', '_search_bin_na_s'), + ('_searchBinNA_us', '_search_bin_na_us'), + ('_searchBinNA_i', '_search_bin_na_i'), + ('_searchBinNA_ui', '_search_bin_na_ui'), + ('_searchBinNA_ll', '_search_bin_na_ll'), + ('_searchBinNA_ull', '_search_bin_na_ull'), + ('_searchBinNA_e', '_search_bin_na_e'), + ('_searchBinNA_f', '_search_bin_na_f'), + ('_searchBinNA_d', '_search_bin_na_d'), + ('_searchBinNA_g', '_search_bin_na_g'), + # from leaf.py + # ('parentNode', 'parentnode'), # kwarg + ('objectID', 'object_id'), # property + ('_processRangeRead', '_process_range_read'), + ('_pointSelection', '_point_selection'), + ('isVisible', 'isvisible'), + ('getAttr', 'get_attr'), + ('setAttr', 'set_attr'), + ('delAttr', 'del_attr'), + # from link.py + # ('parentNode', 'parentnode'), # kwarg + ('_g_getLinkClass', '_g_get_link_class'), + # from linkextension + ('linkExtension', 'linkextension'), + ('_getLinkClass', '_get_link_class'), + ('_g_createHardLink', '_g_create_hard_link'), + # from lrucacheextension + ('lrucacheExtension', 'lrucacheextension'), + # from 
misc/enum.py + ('_checkAndSetPair', '_check_and_set_pair'), + ('_getContainer', '_get_container'), + # from misc/proxydict.py + ('containerRef', 'containerref'), # attr + # from node.py + # ('parentNode', 'parentnode'), # kwarg + ('_g_logCreate', '_g_log_create'), + ('_g_preKillHook', '_g_pre_kill_hook'), + ('_g_checkOpen', '_g_check_open'), + ('_g_setLocation', '_g_set_location'), + ('_g_updateLocation', '_g_update_location'), + ('_g_delLocation', '_g_del_location'), + ('_g_updateDependent', '_g_update_dependent'), + ('_g_removeAndLog', '_g_remove_and_log'), + ('_g_logMove', '_g_log_move'), + ('oldPathname', 'oldpathname'), # ?? + ('_g_copyAsChild', '_g_copy_as_child'), + ('_f_isVisible', '_f_isvisible'), + ('_g_checkGroup', '_g_check_group'), + ('_g_checkNotContains', '_g_check_not_contains'), + ('_g_maybeRemove', '_g_maybe_remove'), + ('_f_getAttr', '_f_getattr'), + ('_f_setAttr', '_f_setattr'), + ('_f_delAttr', '_f_delattr'), + ('_v_maxTreeDepth', '_v_maxtreedepth'), # attr + # from nodes/filenode.py + ('newNode', 'new_node'), + ('openNode', 'open_node'), + ('_lineChunkSize', '_line_chunksize'), # attr (private) + ('_lineSeparator', '_line_separator'), # attr (private) + # ('getLineSeparator', 'get_line_separator'), # dropped + # ('setLineSeparator', 'set_line_separator'), # dropped + # ('delLineSeparator', 'del_line_separator'), # dropped + # ('lineSeparator', 'line_separator'), # property -- dropped + ('_notReadableError', '_not_readable_error'), + ('_appendZeros', '_append_zeros'), + ('getAttrs', '_get_attrs'), + ('setAttrs', '_set_attrs'), + ('delAttrs', '_del_attrs'), + ('_setAttributes', '_set_attributes'), + ('_checkAttributes', '_check_attributes'), + ('_checkNotClosed', '_check_not_closed'), + ('__allowedInitKwArgs', '__allowed_init_kwargs'), # attr (private) + ('_byteShape', '_byte_shape'), # attr (private) + ('_sizeToShape', '_size_to_shape'), # attr (private) + ('_vType', '_vtype'), # attr (private) + ('_vShape', '_vshape'), # attr (private) + # from path.py + ('parentPath', 'parentpath'), # kwarg + ('_pythonIdRE', '_python_id_re'), # attr (private) + ('_reservedIdRE', '_reserved_id_re'), # attr (private) + ('_hiddenNameRE', '_hidden_name_re'), # attr (private) + ('_hiddenPathRE', '_hidden_path_re'), # attr (private) + ('checkNameValidity', 'check_name_validity'), + ('joinPath', 'join_path'), + ('splitPath', 'split_path'), + ('isVisibleName', 'isvisiblename'), + ('isVisiblePath', 'isvisiblepath'), + # from registry.py + ('className', 'classname'), # kwarg + ('classNameDict', 'class_name_dict'), # data + ('classIdDict', 'class_id_dict'), # data + ('getClassByName', 'get_class_by_name'), + # from scripts/ptdump.py + ('dumpLeaf', 'dump_leaf'), + ('dumpGroup', 'dump_group'), + # from scripts/ptrepack.py + ('newdstGroup', 'newdst_group'), + ('recreateIndexes', 'recreate_indexes'), + ('copyLeaf', 'copy_leaf'), + # from table.py + # ('parentNode', 'parentnode'), # kwarg + ('_nxTypeFromNPType', '_nxtype_from_nptype'), # data (private) + ('_npSizeType', '_npsizetype'), # data (private) + ('_indexNameOf', '_index_name_of'), + ('_indexPathnameOf', '_index_pathname_of'), + ('_indexPathnameOfColumn', '_index_pathname_of_column'), + ('_indexNameOf_', '_index_name_of_'), + ('_indexPathnameOf_', '_index_pathname_of_'), + ('_indexPathnameOfColumn_', '_index_pathname_of_column_'), + ('_table__setautoIndex', '_table__setautoindex'), + ('_table__getautoIndex', '_table__getautoindex'), + ('_table__autoIndex', '_table__autoindex'), # data (private) + ('_table__whereIndexed', 
'_table__where_indexed'), + ('createIndexesTable', 'create_indexes_table'), + ('createIndexesDescr', 'create_indexes_descr'), + ('_column__createIndex', '_column__create_index'), + ('_autoIndex', '_autoindex'), # attr + ('autoIndex', 'autoindex'), # attr + ('_useIndex', '_use_index'), + ('_whereCondition', '_where_condition'), # attr (private) + ('_conditionCache', '_condition_cache'), # attr (private) + # ('_exprvarsCache', '_exprvars_cache'), + ('_enabledIndexingInQueries', + '_enabled_indexing_in_queries'), # attr (private) + ('_emptyArrayCache', '_empty_array_cache'), # attr (private) + ('_getTypeColNames', '_get_type_col_names'), + ('_getEnumMap', '_get_enum_map'), + ('_cacheDescriptionData', '_cache_description_data'), + ('_getColumnInstance', '_get_column_instance'), + ('_checkColumn', '_check_column'), + ('_disableIndexingInQueries', '_disable_indexing_in_queries'), + ('_enableIndexingInQueries', '_enable_indexing_in_queries'), + # ('_requiredExprVars', '_required_expr_vars'), + ('_getConditionKey', '_get_condition_key'), + ('_compileCondition', '_compile_condition'), + ('willQueryUseIndexing', 'will_query_use_indexing'), + ('readWhere', 'read_where'), + ('whereAppend', 'append_where'), + ('getWhereList', 'get_where_list'), + ('_check_sortby_CSI', '_check_sortby_csi'), + ('_readCoordinates', '_read_coordinates'), + ('readCoordinates', 'read_coordinates'), + ('_saveBufferedRows', '_save_buffered_rows'), + ('modifyCoordinates', 'modify_coordinates'), + ('modifyRows', 'modify_rows'), + ('modifyColumn', 'modify_column'), + ('modifyColumns', 'modify_columns'), + ('flushRowsToIndex', 'flush_rows_to_index'), + ('_addRowsToIndex', '_add_rows_to_index'), + ('removeRows', 'remove_rows'), + ('_setColumnIndexing', '_set_column_indexing'), + ('_markColumnsAsDirty', '_mark_columns_as_dirty'), + ('_reIndex', '_reindex'), + ('_doReIndex', '_do_reindex'), + ('reIndex', 'reindex'), + ('reIndexDirty', 'reindex_dirty'), + ('_g_copyRows', '_g_copy_rows'), + ('_g_copyRows_optim', '_g_copy_rows_optim'), + ('_g_propIndexes', '_g_prop_indexes'), + ('_g_updateTableLocation', '_g_update_table_location'), + ('_tableFile', '_table_file'), # attr (private) + ('_tablePath', '_table_path'), # attr (private) + ('createIndex', 'create_index'), + ('createCSIndex', 'create_csindex'), + ('removeIndex', 'remove_index'), + # from tableextension + ('tableExtension', 'tableextension'), + ('getNestedFieldCache', 'get_nested_field_cache'), + ('getNestedType', 'get_nested_type'), + ('_createTable', '_create_table'), + ('_getInfo', '_get_info'), + ('indexChunk', 'indexchunk'), # attr + ('indexValid', 'indexvalid'), # attr + ('indexValues', 'indexvalues'), # attr + ('bufcoordsData', 'bufcoords_data'), # attr + ('indexValuesData', 'index_values_data'), # attr + ('chunkmapData', 'chunkmap_data'), # attr + ('indexValidData', 'index_valid_data'), # attr + ('whereCond', 'wherecond'), # attr + ('iterseqMaxElements', 'iterseq_max_elements'), # attr + ('IObuf', 'iobuf'), # attr + ('IObufcpy', 'iobufcpy'), # attr + ('_convertTime64_', '_convert_time64_'), + ('_convertTypes', '_convert_types'), + ('_newBuffer', '_new_buffer'), + ('__next__inKernel', '__next__inkernel'), + ('_fillCol', '_fill_col'), + ('_flushBufferedRows', '_flush_buffered_rows'), + ('_getUnsavedNrows', '_get_unsaved_nrows'), + ('_flushModRows', '_flush_mod_rows'), + # from undoredo.py + ('moveToShadow', 'move_to_shadow'), + ('moveFromShadow', 'move_from_shadow'), + ('undoCreate', 'undo_create'), + ('redoCreate', 'redo_create'), + ('undoRemove', 'undo_remove'), + 
('redoRemove', 'redo_remove'), + ('undoMove', 'undo_move'), + ('redoMove', 'redo_move'), + ('attrToShadow', 'attr_to_shadow'), + ('attrFromShadow', 'attr_from_shadow'), + ('undoAddAttr', 'undo_add_attr'), + ('redoAddAttr', 'redo_add_attr'), + ('undoDelAttr', 'undo_del_attr'), + ('redoDelAttr', 'redo_del_attr'), + # from utils.py + ('convertToNPAtom', 'convert_to_np_atom'), + ('convertToNPAtom2', 'convert_to_np_atom2'), + ('checkFileAccess', 'check_file_access'), + ('logInstanceCreation', 'log_instance_creation'), + ('fetchLoggedInstances', 'fetch_logged_instances'), + ('countLoggedInstances', 'count_logged_instances'), + ('listLoggedInstances', 'list_logged_instances'), + ('dumpLoggedInstances', 'dump_logged_instances'), + ('detectNumberOfCores', 'detect_number_of_cores'), + # from utilsextension + ('utilsExtension', 'utilsextension'), + ('PTTypeToHDF5', 'pttype_to_hdf5'), # data + ('PTSpecialKinds', 'pt_special_kinds'), # data + ('NPExtPrefixesToPTKinds', 'npext_prefixes_to_ptkinds'), # data + ('HDF5ClassToString', 'hdf5_class_to_string'), # data + ('setBloscMaxThreads', 'set_blosc_max_threads'), + ('silenceHDF5Messages', 'silence_hdf5_messages'), + ('isHDF5File', 'is_hdf5_file'), + ('isPyTablesFile', 'is_pytables_file'), + ('getHDF5Version', 'get_hdf5_version'), + ('getPyTablesVersion', 'get_pytables_version'), + ('whichLibVersion', 'which_lib_version'), + ('whichClass', 'which_class'), + ('getNestedField', 'get_nested_field'), + ('getFilters', 'get_filters'), + ('getTypeEnum', 'get_type_enum'), + ('enumFromHDF5', 'enum_from_hdf5'), + ('enumToHDF5', 'enum_to_hdf5'), + ('AtomToHDF5Type', 'atom_to_hdf5_type'), + ('loadEnum', 'load_enum'), + ('HDF5ToNPNestedType', 'hdf5_to_np_nested_type'), + ('HDF5ToNPExtType', 'hdf5_to_np_ext_type'), + ('AtomFromHDF5Type', 'atom_from_hdf5_type'), + ('createNestedType', 'create_nested_type'), + # from unimlemented.py + ('_openUnImplemented', '_open_unimplemented'), + # from vlarray.py + # ('parentNode', 'parentnode'), # kwarg + # ('expectedsizeinMB', 'expected_mb'), # --> expectedrows + # ('_v_expectedsizeinMB', '_v_expected_mb'), # --> expectedrows +]) + +new2oldnames = {v: k for k, v in old2newnames.items()} + +# Note that it is tempting to use the ast module here, but then this +# breaks transforming cython files. So instead we are going to do the +# dumb thing with replace. + + +def make_subs(ns): + names = new2oldnames if ns.reverse else old2newnames + s = r'(?<=\W)({})(?=\W)'.format('|'.join(list(names))) + if ns.ignore_previous: + s += r'(?!\s*?=\s*?previous_api(_property)?\()' + s += r'(?!\* to \*\w+\*)' + s += r'(?!\* parameter has been renamed into \*\w+\*\.)' + s += r'(?! 
is pending deprecation, import \w+ instead\.)' + subs = re.compile(s, flags=re.MULTILINE) + + def repl(m): + return names.get(m.group(1), m.group(0)) + return subs, repl + + +def main(): + desc = ('PyTables 2.x -> 3.x API transition tool\n\n' + 'This tool displays to standard out, so it is \n' + 'common to pipe this to another file:\n\n' + '$ pt2to3 oldfile.py > newfile.py') + parser = argparse.ArgumentParser(description=desc) + parser.add_argument('-r', '--reverse', action='store_true', default=False, + dest='reverse', + help="reverts changes, going from 3.x -> 2.x.") + parser.add_argument('-p', '--no-ignore-previous', action='store_false', + default=True, dest='ignore_previous', + help="ignores previous_api() calls.") + parser.add_argument('-o', default=None, dest='output', + help="output file to write to.") + parser.add_argument('-i', '--inplace', action='store_true', default=False, + dest='inplace', help="overwrites the file in-place.") + parser.add_argument('filename', help='path to input file.') + ns = parser.parse_args() + + if not Path(ns.filename).is_file(): + sys.exit(f'file {ns.filename!r} not found') + src = Path(ns.filename).read_text() + + subs, repl = make_subs(ns) + targ = subs.sub(repl, src) + + ns.output = ns.filename if ns.inplace else ns.output + if ns.output is None: + sys.stdout.write(targ) + else: + Path(ns.output).write_text(targ) + + +if __name__ == '__main__': + main() diff --git a/tables/scripts/ptdump.py b/tables/scripts/ptdump.py new file mode 100644 index 0000000..a492e8d --- /dev/null +++ b/tables/scripts/ptdump.py @@ -0,0 +1,173 @@ +"""This utility lets you look into the data and metadata of your data files. + +Pass the flag -h to this for help on usage. + +""" + + +import argparse +import operator + +import tables as tb + + +# default options +options = argparse.Namespace( + rng=slice(None), + showattrs=0, + verbose=0, + dump=0, + colinfo=0, + idxinfo=0, +) + + +def dump_leaf(leaf): + if options.verbose: + print(repr(leaf)) + else: + print(str(leaf)) + if options.showattrs: + print(" "+repr(leaf.attrs)) + if options.dump and not isinstance(leaf, tb.unimplemented.UnImplemented): + print(" Data dump:") + # print((leaf.read(options.rng.start, options.rng.stop, + # options.rng.step)) + # This is better for large objects + if options.rng.start is None: + start = 0 + else: + start = options.rng.start + if options.rng.stop is None: + if leaf.shape != (): + stop = leaf.shape[0] + else: + stop = options.rng.stop + if options.rng.step is None: + step = 1 + else: + step = options.rng.step + if leaf.shape == (): + print("[SCALAR] %s" % (leaf[()])) + else: + for i in range(start, stop, step): + print("[{}] {}".format(i, leaf[i])) + + if isinstance(leaf, tb.table.Table) and options.colinfo: + # Show info of columns + for colname in leaf.colnames: + print(repr(leaf.cols._f_col(colname))) + + if isinstance(leaf, tb.table.Table) and options.idxinfo: + # Show info of indexes + for colname in leaf.colnames: + col = leaf.cols._f_col(colname) + if isinstance(col, tb.table.Column) and col.index is not None: + idx = col.index + print(repr(idx)) + + +def dump_group(pgroup, sort=False): + node_kinds = pgroup._v_file._node_kinds[1:] + what = pgroup._f_walk_groups() + if sort: + what = sorted(what, key=operator.attrgetter('_v_pathname')) + for group in what: + print(str(group)) + if options.showattrs: + print(" "+repr(group._v_attrs)) + for kind in node_kinds: + for node in group._f_list_nodes(kind): + if options.verbose or options.dump: + dump_leaf(node) + else: + print(str(node)) + 
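+
+# Typical invocations handled by the parser defined below (the file and
+# node names are only examples):
+#
+#   ptdump data.h5                  # show the whole object tree
+#   ptdump -v -a data.h5:/group     # verbose listing of /group, with attrs
+#   ptdump -d -R 0,10,1 data.h5:/group/table   # dump the first ten rows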
+ +def _get_parser(): + parser = argparse.ArgumentParser( + description='''The ptdump utility allows you look into the contents + of your PyTables files. It lets you see not only the data but also + the metadata (that is, the *structure* and additional information in + the form of *attributes*).''') + + parser.add_argument( + '-v', '--verbose', action='store_true', + help='dump more metainformation on nodes', + ) + parser.add_argument( + '-d', '--dump', action='store_true', + help='dump data information on leaves', + ) + parser.add_argument( + '-a', '--showattrs', action='store_true', + help='show attributes in nodes (only useful when -v or -d are active)', + ) + parser.add_argument( + '-s', '--sort', action='store_true', + help='sort output by node name', + ) + parser.add_argument( + '-c', '--colinfo', action='store_true', + help='''show info of columns in tables (only useful when -v or -d + are active)''', + ) + parser.add_argument( + '-i', '--idxinfo', action='store_true', + help='''show info of indexed columns (only useful when -v or -d are + active)''', + ) + parser.add_argument( + '-R', '--range', dest='rng', metavar='RANGE', + help='''select a RANGE of rows (in the form "start,stop,step") + during the copy of *all* the leaves. + Default values are "None,None,1", which means a copy of all the + rows.''', + ) + parser.add_argument('src', metavar='filename[:nodepath]', + help='name of the HDF5 file to dump') + + return parser + + +def main(): + parser = _get_parser() + + args = parser.parse_args(namespace=options) + + # Get the options + if isinstance(args.rng, str): + try: + options.rng = eval("slice(" + args.rng + ")") + except Exception: + parser.error("Error when getting the range parameter.") + else: + args.dump = 1 + + # Catch the files passed as the last arguments + src = args.src.rsplit(':', 1) + if len(src) == 1: + filename, nodename = src[0], "/" + else: + filename, nodename = src + if nodename == "": + # case where filename == "filename:" instead of "filename:/" + nodename = "/" + + try: + h5file = tb.open_file(filename, 'r') + except Exception as e: + return 'Cannot open input file: ' + str(e) + + with h5file: + # Check whether the specified node is a group or a leaf + nodeobject = h5file.get_node(nodename) + if isinstance(nodeobject, tb.group.Group): + # Close the file again and reopen using the root_uep + dump_group(nodeobject, args.sort) + elif isinstance(nodeobject, tb.leaf.Leaf): + # If it is not a Group, it must be a Leaf + dump_leaf(nodeobject) + else: + # This should never happen + print("Unrecognized object:", nodeobject) diff --git a/tables/scripts/ptrepack.py b/tables/scripts/ptrepack.py new file mode 100644 index 0000000..11067b7 --- /dev/null +++ b/tables/scripts/ptrepack.py @@ -0,0 +1,570 @@ +"""This utility lets you repack your data files in a flexible way. + +Pass the flag -h to this for help on usage. + +""" + +import argparse +import sys +import warnings +from pathlib import Path +from time import perf_counter as clock +from time import process_time as cpuclock + +import tables as tb + + +# Global variables +verbose = False +regoldindexes = True +createsysattrs = True + +numpy_aliases = [ + 'numeric', + 'Numeric', + 'numarray', + 'NumArray', + 'CharArray', +] + + +def newdst_group(dstfileh, dstgroup, title, filters): + group = dstfileh.root + # Now, create the new group. This works even if dstgroup == '/' + for nodename in dstgroup.split('/'): + if nodename == '': + continue + # First try if possible intermediate groups does already exist. 
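+        # (get_node raises NoSuchNodeError when the child group is missing,
+        # in which case it is created in the except branch below)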
+ try: + group2 = dstfileh.get_node(group, nodename) + except tb.exceptions.NoSuchNodeError: + # The group does not exist. Create it. + group2 = dstfileh.create_group(group, nodename, + title=title, + filters=filters) + group = group2 + return group + + +def recreate_indexes(table, dstfileh, dsttable): + listoldindexes = table._listoldindexes + if listoldindexes != []: + if not regoldindexes: + if verbose: + print("[I]Not regenerating indexes for table: '%s:%s'" % + (dstfileh.filename, dsttable._v_pathname)) + return + # Now, recreate the indexed columns + if verbose: + print("[I]Regenerating indexes for table: '%s:%s'" % + (dstfileh.filename, dsttable._v_pathname)) + for colname in listoldindexes: + if verbose: + print("[I]Indexing column: '%s'. Please wait..." % colname) + colobj = dsttable.cols._f_col(colname) + # We don't specify the filters for the indexes + colobj.create_index(filters=None) + + +def copy_leaf(srcfile, dstfile, srcnode, dstnode, title, + filters, copyuserattrs, overwritefile, overwrtnodes, stats, + start, stop, step, chunkshape, sortby, check_CSI, + propindexes, upgradeflavors, allow_padding): + # Open the source file + srcfileh = tb.open_file(srcfile, 'r', allow_padding=allow_padding) + # Get the source node (that should exist) + srcnode = srcfileh.get_node(srcnode) + + # Get the destination node and its parent + last_slash = dstnode.rindex('/') + if last_slash == len(dstnode)-1: + # print("Detected a trailing slash in destination node. " + # "Interpreting it as a destination group.") + dstgroup = dstnode[:-1] + elif last_slash > 0: + dstgroup = dstnode[:last_slash] + else: + dstgroup = "/" + dstleaf = dstnode[last_slash + 1:] + if dstleaf == "": + dstleaf = srcnode.name + # Check whether the destination group exists or not + if Path(dstfile).is_file() and not overwritefile: + dstfileh = tb.open_file(dstfile, 'a', + pytables_sys_attrs=createsysattrs, + allow_padding=allow_padding) + try: + dstgroup = dstfileh.get_node(dstgroup) + except Exception: + # The dstgroup does not seem to exist. Try creating it. + dstgroup = newdst_group(dstfileh, dstgroup, title, filters) + else: + # The node exists, but it is really a group? + if not isinstance(dstgroup, tb.group.Group): + # No. Should we overwrite it? + if overwrtnodes: + parent = dstgroup._v_parent + last_slash = dstgroup._v_pathname.rindex('/') + dstgroupname = dstgroup._v_pathname[last_slash + 1:] + dstgroup.remove() + dstgroup = dstfileh.create_group(parent, dstgroupname, + title=title, + filters=filters) + else: + raise RuntimeError("Please check that the node names are " + "not duplicated in destination, and " + "if so, add the --overwrite-nodes " + "flag if desired.") + else: + # The destination file does not exist or will be overwritten. 
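+        # (mode 'w' creates the file, truncating any existing one)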
+ dstfileh = tb.open_file(dstfile, 'w', title=title, filters=filters, + pytables_sys_attrs=createsysattrs, + allow_padding=allow_padding) + dstgroup = newdst_group(dstfileh, dstgroup, title="", filters=filters) + + # Finally, copy srcnode to dstnode + try: + dstnode = srcnode.copy( + dstgroup, dstleaf, filters=filters, + copyuserattrs=copyuserattrs, overwrite=overwrtnodes, + stats=stats, start=start, stop=stop, step=step, + chunkshape=chunkshape, + sortby=sortby, check_CSI=check_CSI, propindexes=propindexes) + except Exception: + (type_, value, traceback) = sys.exc_info() + print("Problems doing the copy from '%s:%s' to '%s:%s'" % + (srcfile, srcnode, dstfile, dstnode)) + print(f"The error was --> {type_}: {value}") + print("The destination file looks like:\n", dstfileh) + # Close all the open files: + srcfileh.close() + dstfileh.close() + raise RuntimeError("Please check that the node names are not " + "duplicated in destination, and if so, add " + "the --overwrite-nodes flag if desired.") + + # Upgrade flavors in dstnode, if required + if upgradeflavors: + if srcfileh.format_version.startswith("1"): + # Remove original flavor in case the source file has 1.x format + dstnode.del_attr('FLAVOR') + elif srcfileh.format_version < "2.1": + if dstnode.get_attr('FLAVOR') in numpy_aliases: + dstnode.set_attr('FLAVOR', tb.flavor.internal_flavor) + + # Recreate possible old indexes in destination node + if srcnode._c_classid == "TABLE": + recreate_indexes(srcnode, dstfileh, dstnode) + + # Close all the open files: + srcfileh.close() + dstfileh.close() + + +def copy_children(srcfile, dstfile, srcgroup, dstgroup, title, + recursive, filters, copyuserattrs, overwritefile, + overwrtnodes, stats, start, stop, step, + chunkshape, sortby, check_CSI, propindexes, + upgradeflavors, allow_padding, use_hardlinks=True): + """Copy the children from source group to destination group""" + # Open the source file with srcgroup as root_uep + srcfileh = tb.open_file(srcfile, 'r', root_uep=srcgroup, + allow_padding=allow_padding) + # Assign the root to srcgroup + srcgroup = srcfileh.root + + created_dstgroup = False + # Check whether the destination group exists or not + if Path(dstfile).is_file() and not overwritefile: + dstfileh = tb.open_file(dstfile, 'a', + pytables_sys_attrs=createsysattrs, + allow_padding=allow_padding) + try: + dstgroup = dstfileh.get_node(dstgroup) + except tb.exceptions.NoSuchNodeError: + # The dstgroup does not seem to exist. Try creating it. + dstgroup = newdst_group(dstfileh, dstgroup, title, filters) + created_dstgroup = True + else: + # The node exists, but it is really a group? + if not isinstance(dstgroup, tb.group.Group): + # No. Should we overwrite it? + if overwrtnodes: + parent = dstgroup._v_parent + last_slash = dstgroup._v_pathname.rindex('/') + dstgroupname = dstgroup._v_pathname[last_slash + 1:] + dstgroup.remove() + dstgroup = dstfileh.create_group(parent, dstgroupname, + title=title, + filters=filters) + else: + raise RuntimeError("Please check that the node names are " + "not duplicated in destination, and " + "if so, add the --overwrite-nodes " + "flag if desired.") + else: + # The destination file does not exist or will be overwritten. 
+ dstfileh = tb.open_file(dstfile, 'w', title=title, filters=filters, + pytables_sys_attrs=createsysattrs, + allow_padding=allow_padding) + dstgroup = newdst_group(dstfileh, dstgroup, title="", filters=filters) + created_dstgroup = True + + # Copy the attributes to dstgroup, if needed + if created_dstgroup and copyuserattrs: + srcgroup._v_attrs._f_copy(dstgroup) + + # Finally, copy srcgroup children to dstgroup + try: + srcgroup._f_copy_children( + dstgroup, recursive=recursive, filters=filters, + copyuserattrs=copyuserattrs, overwrite=overwrtnodes, + stats=stats, start=start, stop=stop, step=step, + chunkshape=chunkshape, + sortby=sortby, check_CSI=check_CSI, propindexes=propindexes, + use_hardlinks=use_hardlinks) + except Exception: + (type_, value, traceback) = sys.exc_info() + print("Problems doing the copy from '%s:%s' to '%s:%s'" % + (srcfile, srcgroup, dstfile, dstgroup)) + print(f"The error was --> {type_}: {value}") + print("The destination file looks like:\n", dstfileh) + # Close all the open files: + srcfileh.close() + dstfileh.close() + raise RuntimeError("Please check that the node names are not " + "duplicated in destination, and if so, add the " + "--overwrite-nodes flag if desired. In " + "particular, pay attention that root_uep is not " + "fooling you.") + + # Upgrade flavors in dstnode, if required + if upgradeflavors: + for dstnode in dstgroup._f_walknodes("Leaf"): + if srcfileh.format_version.startswith("1"): + # Remove original flavor in case the source file has 1.x format + dstnode.del_attr('FLAVOR') + elif srcfileh.format_version < "2.1": + if dstnode.get_attr('FLAVOR') in numpy_aliases: + dstnode.set_attr('FLAVOR', tb.flavor.internal_flavor) + + # Convert the remaining tables with old indexes (if any) + for table in srcgroup._f_walknodes("Table"): + dsttable = dstfileh.get_node(dstgroup, table._v_pathname) + recreate_indexes(table, dstfileh, dsttable) + + # Close all the open files: + srcfileh.close() + dstfileh.close() + + +def _get_parser(): + parser = argparse.ArgumentParser( + description='''This utility is very powerful and lets you copy any + leaf, group or complete subtree into another file. + During the copy process you are allowed to change the filter + properties if you want so. Also, in the case of duplicated pathnames, + you can decide if you want to overwrite already existing nodes on the + destination file. Generally speaking, ptrepack can be useful in may + situations, like replicating a subtree in another file, change the + filters in objects and see how affect this to the compression degree + or I/O performance, consolidating specific data in repositories or + even *importing* generic HDF5 files and create true PyTables + counterparts.''') + + parser.add_argument( + '-v', '--verbose', action='store_true', + help='show verbose information', + ) + parser.add_argument( + '-o', '--overwrite', action='store_true', dest='overwritefile', + help='overwrite destination file', + ) + parser.add_argument( + '-R', '--range', dest='rng', metavar='RANGE', + help='''select a RANGE of rows (in the form "start,stop,step") + during the copy of *all* the leaves. + Default values are "None,None,1", which means a copy of all the + rows.''', + ) + parser.add_argument( + '--non-recursive', action='store_false', default=True, + dest='recursive', + help='do not do a recursive copy. 
Default is to do it', + ) + parser.add_argument( + '--dest-title', dest='title', default='', + help='title for the new file (if not specified, the source is copied)', + ) + parser.add_argument( + '--dont-create-sysattrs', action='store_false', default=True, + dest='createsysattrs', + help='do not create sys attrs (default is to do it)', + ) + parser.add_argument( + '--dont-copy-userattrs', action='store_false', default=True, + dest='copyuserattrs', + help='do not copy the user attrs (default is to do it)', + ) + parser.add_argument( + '--overwrite-nodes', action='store_true', dest='overwrtnodes', + help='''overwrite destination nodes if they exist. + Default is to not overwrite them''', + ) + parser.add_argument( + '--complevel', type=int, default=0, + help='''set a compression level (0 for no compression, which is the + default)''', + ) + parser.add_argument( + '--complib', choices=( + "zlib", "lzo", "bzip2", "blosc", "blosc:blosclz", + "blosc:lz4", "blosc:lz4hc", "blosc:snappy", + "blosc:zlib", "blosc:zstd"), default='zlib', + help='''set the compression library to be used during the copy. + Defaults to %(default)s''', + ) + parser.add_argument( + '--shuffle', type=int, choices=(0, 1), + help='''activate or not the shuffle filter (default is active if + complevel > 0)''', + ) + parser.add_argument( + '--bitshuffle', type=int, choices=(0, 1), + help='activate or not the bitshuffle filter (not active by default)', + ) + parser.add_argument( + '--fletcher32', type=int, choices=(0, 1), + help='''whether to activate or not the fletcher32 filter (not active + by default)''', + ) + parser.add_argument( + '--keep-source-filters', action='store_true', dest='keepfilters', + help='''use the original filters in source files. + The default is not doing that if any of --complevel, --complib, + --shuffle --bitshuffle or --fletcher32 option is specified''', + ) + parser.add_argument( + '--chunkshape', default='keep', + help='''set a chunkshape. + Possible options are: "keep" | "auto" | int | tuple. + A value of "auto" computes a sensible value for the chunkshape of the + leaves copied. The default is to "keep" the original value''', + ) + parser.add_argument( + '--upgrade-flavors', action='store_true', dest='upgradeflavors', + help='''when repacking PyTables 1.x or PyTables 2.x files, the flavor + of leaves will be unset. With this, such a leaves will be serialized + as objects with the internal flavor ('numpy' for 3.x series)''', + ) + parser.add_argument( + '--dont-regenerate-old-indexes', action='store_false', default=True, + dest='regoldindexes', + help='''disable regenerating old indexes. + The default is to regenerate old indexes as they are found''', + ) + parser.add_argument( + '--sortby', metavar='COLUMN', + help='''do a table copy sorted by the index in "column". + For reversing the order, use a negative value in the "step" part of + "RANGE" (see "-r" flag). Only applies to table objects''', + ) + parser.add_argument( + '--checkCSI', action='store_true', + help='force the check for a CSI index for the --sortby column', + ) + parser.add_argument( + '--propindexes', action='store_true', + help='''propagate the indexes existing in original tables. The default + is to not propagate them. Only applies to table objects''', + ) + parser.add_argument( + '--dont-allow-padding', action='store_true', + dest="dont_allow_padding", + help='''remove the possible padding in compound types in source files. + The default is to propagate it. 
Only applies to table objects''', + ) + parser.add_argument( + 'src', metavar='sourcefile:sourcegroup', help='source file/group', + ) + parser.add_argument( + 'dst', metavar='destfile:destgroup', help='destination file/group', + ) + + return parser + + +def main(): + global verbose + global regoldindexes + global createsysattrs + + parser = _get_parser() + args = parser.parse_args() + + # check arguments + if args.rng: + try: + args.rng = eval("slice(" + args.rng + ")") + except Exception: + parser.error("Error when getting the range parameter.") + + if args.chunkshape.isdigit() or args.chunkshape.startswith('('): + args.chunkshape = eval(args.chunkshape) + + if args.complevel < 0 or args.complevel > 9: + parser.error( + 'invalid "complevel" value, it sould be in te range [0, 9]' + ) + + # Catch the files passed as the last arguments + src = args.src.rsplit(':', 1) + dst = args.dst.rsplit(':', 1) + if len(src) == 1: + srcfile, srcnode = src[0], "/" + else: + srcfile, srcnode = src + if len(dst) == 1: + dstfile, dstnode = dst[0], "/" + else: + dstfile, dstnode = dst + + if srcnode == "": + # case where filename == "filename:" instead of "filename:/" + srcnode = "/" + + if dstnode == "": + # case where filename == "filename:" instead of "filename:/" + dstnode = "/" + + # Ignore the warnings for tables that contains oldindexes + # (these will be handled by the copying routines) + warnings.filterwarnings("ignore", category=tb.exceptions.OldIndexWarning) + + # Ignore the flavors warnings during upgrading flavor operations + if args.upgradeflavors: + warnings.filterwarnings("ignore", category=tb.exceptions.FlavorWarning) + + # Build the Filters instance + filter_params = ( + args.complevel, + args.complib, + args.shuffle, + args.bitshuffle, + args.fletcher32, + ) + if (filter_params == (None,) * 4 or args.keepfilters): + filters = None + else: + if args.complevel is None: + args.complevel = 0 + if args.shuffle is None: + if args.complevel > 0: + args.shuffle = True + else: + args.shuffle = False + if args.bitshuffle is None: + args.bitshuffle = False + if args.bitshuffle: + # Shuffle and bitshuffle are mutually exclusive + args.shuffle = False + if args.complib is None: + args.complib = "zlib" + if args.fletcher32 is None: + args.fletcher32 = False + filters = tb.leaf.Filters(complevel=args.complevel, + complib=args.complib, shuffle=args.shuffle, + bitshuffle=args.bitshuffle, + fletcher32=args.fletcher32) + + # The start, stop and step params: + start, stop, step = None, None, 1 # Defaults + if args.rng: + start, stop, step = args.rng.start, args.rng.stop, args.rng.step + + # Set globals + verbose = args.verbose + regoldindexes = args.regoldindexes + createsysattrs = args.createsysattrs + + # Some timing + t1 = clock() + cpu1 = cpuclock() + # Copy the file + if verbose: + print("+=+" * 20) + print("Recursive copy:", args.recursive) + print("Applying filters:", filters) + if args.sortby is not None: + print("Sorting table(s) by column:", args.sortby) + print("Forcing a CSI creation:", args.checkCSI) + if args.propindexes: + print("Recreating indexes in copied table(s)") + print(f"Start copying {srcfile}:{srcnode} to {dstfile}:{dstnode}") + print("+=+" * 20) + + allow_padding = not args.dont_allow_padding + # Check whether the specified source node is a group or a leaf + h5srcfile = tb.open_file(srcfile, 'r', allow_padding=allow_padding) + srcnodeobject = h5srcfile.get_node(srcnode) + + # Close the file again + h5srcfile.close() + + stats = {'groups': 0, 'leaves': 0, 'links': 0, 'bytes': 0, 
'hardlinks': 0} + if isinstance(srcnodeobject, tb.group.Group): + copy_children( + srcfile, dstfile, srcnode, dstnode, + title=args.title, recursive=args.recursive, filters=filters, + copyuserattrs=args.copyuserattrs, overwritefile=args.overwritefile, + overwrtnodes=args.overwrtnodes, stats=stats, + start=start, stop=stop, step=step, chunkshape=args.chunkshape, + sortby=args.sortby, check_CSI=args.checkCSI, + propindexes=args.propindexes, + upgradeflavors=args.upgradeflavors, + allow_padding=allow_padding, + use_hardlinks=True) + else: + # If not a Group, it should be a Leaf + copy_leaf( + srcfile, dstfile, srcnode, dstnode, + title=args.title, filters=filters, + copyuserattrs=args.copyuserattrs, + overwritefile=args.overwritefile, overwrtnodes=args.overwrtnodes, + stats=stats, start=start, stop=stop, step=step, + chunkshape=args.chunkshape, + sortby=args.sortby, check_CSI=args.checkCSI, + propindexes=args.propindexes, + upgradeflavors=args.upgradeflavors, + allow_padding=allow_padding, + ) + + # Gather some statistics + t2 = clock() + cpu2 = cpuclock() + tcopy = t2 - t1 + cpucopy = cpu2 - cpu1 + if verbose: + ngroups = stats['groups'] + nleaves = stats['leaves'] + nlinks = stats['links'] + nhardlinks = stats['hardlinks'] + nbytescopied = stats['bytes'] + nnodes = ngroups + nleaves + nlinks + nhardlinks + + print( + "Groups copied:", ngroups, + ", Leaves copied:", nleaves, + ", Links copied:", nlinks, + ", Hard links copied:", nhardlinks, + ) + if args.copyuserattrs: + print("User attrs copied") + else: + print("User attrs not copied") + print(f"KBytes copied: {nbytescopied / 1024:.3f}") + print( + f"Time copying: {tcopy:.3f} s (real) {cpucopy:.3f} s " + f"(cpu) {cpucopy / tcopy:.0%}") + print(f"Copied nodes/sec: {nnodes / tcopy:.1f}") + print(f"Copied KB/s : {nbytescopied / tcopy / 1024:.0f}") diff --git a/tables/scripts/pttree.py b/tables/scripts/pttree.py new file mode 100644 index 0000000..f1495a4 --- /dev/null +++ b/tables/scripts/pttree.py @@ -0,0 +1,455 @@ +"""This utility prints the contents of an HDF5 file as a tree. + +Pass the flag -h to this for help on usage. + +""" + +import os +import argparse +from collections import defaultdict, deque +import warnings +from pathlib import Path + +import numpy as np +import tables as tb + + +def _get_parser(): + parser = argparse.ArgumentParser( + description=''' + `pttree` is designed to give a quick overview of the contents of a + PyTables HDF5 file by printing a depth-indented list of nodes, similar + to the output of the Unix `tree` function. + + It can also display the size, shape and compression states of + individual nodes, as well as summary information for the whole file. + + For a more verbose output (including metadata), see `ptdump`. 
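+
+    A typical invocation (the file and node names are only examples) is
+    "pttree -L 2 --print-shape data.h5:/results", which limits the
+    display to two levels below /results and also reports the shape of
+    each node.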
+ ''') + + parser.add_argument( + '-L', '--max-level', type=int, dest='max_depth', + help='maximum branch depth of tree to display (-1 == no limit)', + ) + parser.add_argument( + '-S', '--sort-by', type=str, dest='sort_by', + help='artificially order nodes, can be either "size", "name" or "none"' + ) + parser.add_argument( + '--print-size', action='store_true', dest='print_size', + help='print size of each node/branch', + ) + parser.add_argument( + '--no-print-size', action='store_false', dest='print_size', + ) + parser.add_argument( + '--print-shape', action='store_true', dest='print_shape', + help='print shape of each node', + ) + parser.add_argument( + '--no-print-shape', action='store_false', dest='print_shape', + ) + parser.add_argument( + '--print-compression', action='store_true', dest='print_compression', + help='print compression library(level) for each compressed node', + ) + parser.add_argument( + '--no-print-compression', action='store_false', + dest='print_compression', + ) + parser.add_argument( + '--print-percent', action='store_true', dest='print_percent', + help='print size of each node as a %% of the total tree size on disk', + ) + parser.add_argument( + '--no-print-percent', action='store_false', + dest='print_percent', + ) + parser.add_argument( + '--use-si-units', action='store_true', dest='use_si_units', + help='report sizes in SI units (1 MB == 10^6 B)', + ) + parser.add_argument( + '--use-binary-units', action='store_false', dest='use_si_units', + help='report sizes in binary units (1 MiB == 2^20 B)', + ) + + parser.add_argument('src', metavar='filename[:nodepath]', + help='path to the root of the tree structure') + + parser.set_defaults(max_depth=1, sort_by="size", print_size=True, + print_percent=True, print_shape=False, + print_compression=False, use_si_units=False) + + return parser + + +def main(): + + parser = _get_parser() + args = parser.parse_args() + + # Catch the files passed as the last arguments + src = args.__dict__.pop('src').rsplit(':', 1) + if len(src) == 1: + filename, nodename = src[0], "/" + else: + filename, nodename = src + if nodename == "": + # case where filename == "filename:" instead of "filename:/" + nodename = "/" + + with tb.open_file(filename, 'r') as f: + tree_str = get_tree_str(f, nodename, **args.__dict__) + print(tree_str) + + pass + + +def get_tree_str(f, where='/', max_depth=-1, print_class=True, + print_size=True, print_percent=True, print_shape=False, + print_compression=False, print_total=True, sort_by=None, + use_si_units=False): + """ + Generate the ASCII string representing the tree structure, and the summary + info (if requested) + """ + + root = f.get_node(where) + root._g_check_open() + start_depth = root._v_depth + if max_depth < 0: + max_depth = os.sys.maxint + + b2h = bytes2human(use_si_units) + + # we will pass over each node in the tree twice + + # on the first pass we'll start at the root node and recurse down the + # branches, finding all of the leaf nodes and calculating the total size + # over all tables and arrays + total_in_mem = 0 + total_on_disk = 0 + total_items = 0 + + # defaultdicts for holding the cumulative branch sizes at each node + in_mem = defaultdict(lambda: 0) + on_disk = defaultdict(lambda: 0) + leaf_count = defaultdict(lambda: 0) + + # keep track of node addresses within the HDF5 file so that we don't count + # nodes with multiple references (i.e. 
hardlinks) multiple times + ref_count = defaultdict(lambda: 0) + ref_idx = defaultdict(lambda: 0) + hl_addresses = defaultdict(lambda: None) + hl_targets = defaultdict(lambda: '') + + stack = deque(root) + leaves = deque() + + while stack: + + node = stack.pop() + + if isinstance(node, tb.link.Link): + # we treat links like leaves, except we don't dereference them to + # get their sizes or addresses + leaves.append(node) + continue + + path = node._v_pathname + addr, rc = node._get_obj_info() + ref_count[addr] += 1 + ref_idx[path] = ref_count[addr] + hl_addresses[path] = addr + + if isinstance(node, tb.UnImplemented): + leaves.append(node) + + elif isinstance(node, tb.Leaf): + + # only count the size of a hardlinked leaf the first time it is + # visited + if ref_count[addr] == 1: + + try: + m = node.size_in_memory + d = node.size_on_disk + + # size of this node + in_mem[path] += m + on_disk[path] += d + leaf_count[path] += 1 + + # total over all nodes + total_in_mem += m + total_on_disk += d + total_items += 1 + + # arbitrarily treat this node as the 'target' for all other + # hardlinks that point to the same address + hl_targets[addr] = path + + except NotImplementedError as e: + # size_on_disk is not implemented for VLArrays + warnings.warn(str(e)) + + # push leaf nodes onto the stack for the next pass + leaves.append(node) + + elif isinstance(node, tb.Group): + + # don't recurse down the same hardlinked branch multiple times! + if ref_count[addr] == 1: + stack.extend(list(node._v_children.values())) + hl_targets[addr] = path + + # if we've already visited this group's address, treat it as a leaf + # instead + else: + leaves.append(node) + + # on the second pass we start at each leaf and work upwards towards the + # root node, computing the cumulative size of each branch at each node, and + # instantiating a PrettyTree object for each node to create an ASCII + # representation of the tree structure + + # this will store the PrettyTree objects for every node we're printing + pretty = {} + + stack = leaves + + while stack: + + node = stack.pop() + path = node._v_pathname + + parent = node._v_parent + parent_path = parent._v_pathname + + # cumulative size at parent node + in_mem[parent_path] += in_mem[path] + on_disk[parent_path] += on_disk[path] + leaf_count[parent_path] += leaf_count[path] + + depth = node._v_depth - start_depth + + # if we're deeper than the max recursion depth, we print nothing + if not depth > max_depth: + + # create a PrettyTree representation of this node + name = node._v_name + if print_class: + name += " (%s)" % node.__class__.__name__ + + labels = [] + ratio = on_disk[path] / total_on_disk + + # if the address of this object has a ref_count > 1, it has + # multiple hardlinks + if ref_count[hl_addresses[path]] > 1: + name += ', addr=%i, ref=%i/%i' % ( + hl_addresses[path], ref_idx[path], + ref_count[hl_addresses[path]] + ) + + if isinstance(node, tb.link.Link): + labels.append('softlink --> %s' % node.target) + + elif ref_idx[path] > 1: + labels.append('hardlink --> %s' + % hl_targets[hl_addresses[path]]) + + elif isinstance(node, (tb.Array, tb.Table)): + + if print_size: + sizestr = 'mem={}, disk={}'.format( + b2h(in_mem[path]), b2h(on_disk[path])) + if print_percent: + sizestr += f' [{ratio:5.1%}]' + labels.append(sizestr) + + if print_shape: + labels.append('shape=%s' % repr(node.shape)) + + if print_compression: + lib = node.filters.complib + level = node.filters.complevel + if level: + compstr = '%s(%i)' % (lib, level) + else: + compstr = 'None' + 
labels.append('compression=%s' % compstr) + + # if we're at our max recursion depth, we'll print summary + # information for this branch + elif depth == max_depth: + itemstr = '... %i leaves' % leaf_count[path] + if print_size: + itemstr += ', mem={}, disk={}'.format( + b2h(in_mem[path]), b2h(on_disk[path])) + if print_percent: + itemstr += f' [{ratio:5.1%}]' + labels.append(itemstr) + + # create a PrettyTree for this node, if one doesn't exist already + if path not in pretty: + pretty.update({path: PrettyTree()}) + pretty[path].name = name + pretty[path].labels = labels + if sort_by == 'size': + # descending size order + pretty[path].sort_by = -ratio + elif sort_by == 'name': + pretty[path].sort_by = node._v_name + else: + # natural order + if path == '/': + # root is not in root._v_children + pretty[path].sort_by = 0 + else: + pretty[path].sort_by = list(parent._v_children.values( + )).index(node) + + # exclude root node or we'll get infinite recursions (since '/' is + # the parent of '/') + if path != '/': + + # create a PrettyTree for the parent of this node, if one + # doesn't exist already + if parent_path not in pretty: + pretty.update({parent_path: PrettyTree()}) + + # make this PrettyTree a child of the parent PrettyTree + pretty[parent_path].add_child(pretty[path]) + + if node is not root and parent not in stack: + # we append to the 'bottom' of the stack, so that we exhaust all of + # the nodes at this level before going up a level in the tree + stack.appendleft(parent) + + out_str = '\n' + '-' * 60 + '\n' * 2 + out_str += str(pretty[root._v_pathname]) + '\n' * 2 + + if print_total: + avg_ratio = total_on_disk / total_in_mem + fsize = Path(f.filename).stat().st_size + + out_str += '-' * 60 + '\n' + out_str += 'Total branch leaves: %i\n' % total_items + out_str += 'Total branch size: {} in memory, {} on disk\n'.format( + b2h(total_in_mem), b2h(total_on_disk)) + out_str += 'Mean compression ratio: %.2f\n' % avg_ratio + out_str += 'HDF5 file size: %s\n' % b2h(fsize) + out_str += '-' * 60 + '\n' + + return out_str + + +class PrettyTree: + + """ + + A pretty ASCII representation of a recursive tree structure. Each node can + have multiple labels, given as a list of strings. + + Example: + -------- + + A = PrettyTree('A', labels=['wow']) + B = PrettyTree('B', labels=['such tree']) + C = PrettyTree('C', children=[A, B]) + D = PrettyTree('D', labels=['so recursive']) + root = PrettyTree('root', labels=['many nodes'], children=[C, D]) + print root + + Credit to Andrew Cooke's blog: + + + """ + + def __init__(self, name=None, children=None, labels=None, sort_by=None): + + # NB: do NOT assign default list/dict arguments in the function + # declaration itself - these objects are shared between ALL instances + # of PrettyTree, and by assigning to them it's easy to get into + # infinite recursions, e.g. 
when 'self in self.children == True' + if children is None: + children = [] + if labels is None: + labels = [] + + self.name = name + self.children = children + self.labels = labels + self.sort_by = sort_by + + def add_child(self, child): + # some basic checks to help to avoid infinite recursion + assert child is not self + assert self not in child.children + if child not in self.children: + self.children.append(child) + + def tree_lines(self): + yield self.name + for label in self.labels: + yield ' ' + label + children = sorted(self.children, key=(lambda c: c.sort_by)) + last = children[-1] if children else None + for child in children: + prefix = '`--' if child is last else '+--' + for line in child.tree_lines(): + yield prefix + line + prefix = ' ' if child is last else '| ' + + def __str__(self): + return "\n".join(self.tree_lines()) + + def __repr__(self): + return f'<{self.__class__.__name__} at 0x{id(self):x}>' + + +def bytes2human(use_si_units=False): + + if use_si_units: + prefixes = 'TB', 'GB', 'MB', 'kB', 'B' + values = 1E12, 1E9, 1E6, 1E3, 1 + else: + prefixes = 'TiB', 'GiB', 'MiB', 'KiB', 'B' + values = 2 ** 40, 2 ** 30, 2 ** 20, 2 ** 10, 1 + + def b2h(nbytes): + + for (prefix, value) in zip(prefixes, values): + scaled = nbytes / value + if scaled >= 1: + break + + return f"{scaled:.1f}{prefix}" + + return b2h + + +def make_test_file(prefix='/tmp'): + f = tb.open_file(str(Path(prefix) / 'test_pttree.hdf5'), 'w') + + g1 = f.create_group('/', 'group1') + g1a = f.create_group(g1, 'group1a') + g1b = f.create_group(g1, 'group1b') + + filters = tb.Filters(complevel=5, complib='bzip2') + + for gg in g1a, g1b: + f.create_carray(gg, 'zeros128b', obj=np.zeros(32, dtype=np.float64), + filters=filters) + f.create_carray(gg, 'random128b', obj=np.random.rand(32), + filters=filters) + + g2 = f.create_group('/', 'group2') + + f.create_soft_link(g2, 'softlink_g1_z128', '/group1/group1a/zeros128b') + f.create_hard_link(g2, 'hardlink_g1a_z128', '/group1/group1a/zeros128b') + f.create_hard_link(g2, 'hardlink_g1a', '/group1/group1a') + + return f diff --git a/tables/table.py b/tables/table.py new file mode 100644 index 0000000..177d11d --- /dev/null +++ b/tables/table.py @@ -0,0 +1,3674 @@ +"""Here is defined the Table class.""" + +import functools +import math +import operator +import sys +import warnings +from pathlib import Path + +from time import perf_counter as clock + +import numexpr as ne +import numpy as np + +from . import tableextension +from .lrucacheextension import ObjectCache, NumCache +from .atom import Atom +from .conditions import compile_condition +from .flavor import flavor_of, array_as_internal, internal_to_flavor +from .utils import is_idx, lazyattr, SizeType, NailedDict as CacheDict +from .leaf import Leaf +from .description import (IsDescription, Description, Col, descr_from_dtype) +from .exceptions import ( + NodeError, HDF5ExtError, PerformanceWarning, OldIndexWarning, + NoSuchNodeError) +from .utilsextension import get_nested_field + +from .path import join_path, split_path +from .index import ( + OldIndex, default_index_filters, default_auto_index, Index, IndexesDescG, + IndexesTableG) + + +profile = False +# profile = True # Uncomment for profiling +if profile: + from .utils import show_stats + + +# 2.2: Added support for complex types. Introduced in version 0.9. +# 2.2.1: Added suport for time types. +# 2.3: Changed the indexes naming schema. +# 2.4: Changed indexes naming schema (again). +# 2.5: Added the FIELD_%d_FILL attributes. 
+# 2.6: Added the FLAVOR attribute (optional). +# 2.7: Numeric and numarray flavors are gone. +obversion = "2.7" # The Table VERSION number + + +# Maps NumPy types to the types used by Numexpr. +_nxtype_from_nptype = { + np.bool_: bool, + np.int8: ne.necompiler.int_, + np.int16: ne.necompiler.int_, + np.int32: ne.necompiler.int_, + np.int64: ne.necompiler.long_, + np.uint8: ne.necompiler.int_, + np.uint16: ne.necompiler.int_, + np.uint32: ne.necompiler.long_, + np.uint64: ne.necompiler.long_, + np.float32: float, + np.float64: ne.necompiler.double, + np.complex64: complex, + np.complex128: complex, + np.bytes_: bytes, +} + +_nxtype_from_nptype[np.str_] = str + +if hasattr(np, 'float16'): + _nxtype_from_nptype[np.float16] = float # XXX: check +if hasattr(np, 'float96'): + _nxtype_from_nptype[np.float96] = ne.necompiler.double # XXX: check +if hasattr(np, 'float128'): + _nxtype_from_nptype[np.float128] = ne.necompiler.double # XXX: check +if hasattr(np, 'complex192'): + _nxtype_from_nptype[np.complex192] = complex # XXX: check +if hasattr(np, 'complex256'): + _nxtype_from_nptype[np.complex256] = complex # XXX: check + + +# The NumPy scalar type corresponding to `SizeType`. +_npsizetype = np.array(SizeType(0)).dtype.type + + +def _index_name_of(node): + return '_i_%s' % node._v_name + + +def _index_pathname_of(node): + nodeParentPath = split_path(node._v_pathname)[0] + return join_path(nodeParentPath, _index_name_of(node)) + + +def _index_pathname_of_column(table, colpathname): + return join_path(_index_pathname_of(table), colpathname) + + +# The next are versions that work with just paths (i.e. we don't need +# a node instance for using them, which can be critical in certain +# situations) + + +def _index_name_of_(nodeName): + return '_i_%s' % nodeName + + +def _index_pathname_of_(nodePath): + nodeParentPath, nodeName = split_path(nodePath) + return join_path(nodeParentPath, _index_name_of_(nodeName)) + + +def _index_pathname_of_column_(tablePath, colpathname): + return join_path(_index_pathname_of_(tablePath), colpathname) + + +def restorecache(self): + # Define a cache for sparse table reads + params = self._v_file.params + chunksize = self._v_chunkshape[0] + nslots = params['TABLE_MAX_SIZE'] / (chunksize * self._v_dtype.itemsize) + self._chunkcache = NumCache((nslots, chunksize), self._v_dtype, + 'table chunk cache') + self._seqcache = ObjectCache(params['ITERSEQ_MAX_SLOTS'], + params['ITERSEQ_MAX_SIZE'], + 'Iter sequence cache') + self._dirtycache = False + + +def _table__where_indexed(self, compiled, condition, condvars, + start, stop, step): + if profile: + tref = clock() + if profile: + show_stats("Entering table_whereIndexed", tref) + self._use_index = True + # Clean the table caches for indexed queries if needed + if self._dirtycache: + restorecache(self) + + # Get the values in expression that are not columns + values = [] + for key, value in condvars.items(): + if isinstance(value, np.ndarray): + values.append((key, value.item())) + # Build a key for the sequence cache + seqkey = (condition, tuple(values), (start, stop, step)) + # Do a lookup in sequential cache for this query + nslot = self._seqcache.getslot(seqkey) + if nslot >= 0: + # Get the row sequence from the cache + seq = self._seqcache.getitem(nslot) + if len(seq) == 0: + return iter([]) + # seq is a list. 
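# Illustrative sketch (editorial aside, not part of the original sources): the
# range correction applied to a cached row-coordinate sequence just below can
# be pictured with plain NumPy.  The coordinates and the (start, stop, step)
# values here are made up for demonstration only.
import numpy as np

cached_seq = np.array([3, 7, 10, 12, 18, 25], dtype='int64')
start, stop, step = 6, 20, 2
mask = (cached_seq >= start) & (cached_seq < stop) & ((cached_seq - start) % step == 0)
print(cached_seq[mask])   # -> [10 12 18]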
+ seq = np.array(seq, dtype='int64') + # Correct the ranges in cached sequence + if (start, stop, step) != (0, self.nrows, 1): + seq = seq[(seq >= start) & ( + seq < stop) & ((seq - start) % step == 0)] + return self.itersequence(seq) + else: + # No luck. self._seqcache will be populated + # in the iterator if possible. (Row._finish_riterator) + self._seqcache_key = seqkey + + # Compute the chunkmap for every index in indexed expression + idxexprs = compiled.index_expressions + strexpr = compiled.string_expression + cmvars = {} + tcoords = 0 + for i, idxexpr in enumerate(idxexprs): + var, ops, lims = idxexpr + col = condvars[var] + index = col.index + assert index is not None, "the chosen column is not indexed" + assert not index.dirty, "the chosen column has a dirty index" + + # Get the number of rows that the indexed condition yields. + range_ = index.get_lookup_range(ops, lims) + ncoords = index.search(range_) + tcoords += ncoords + if index.reduction == 1 and ncoords == 0: + # No values from index condition, thus the chunkmap should be empty + nrowsinchunk = self.chunkshape[0] + nchunks = math.ceil(self.nrows / nrowsinchunk) + chunkmap = np.zeros(shape=nchunks, dtype="bool") + else: + # Get the chunkmap from the index + chunkmap = index.get_chunkmap() + # Assign the chunkmap to the cmvars dictionary + cmvars["e%d" % i] = chunkmap + + if index.reduction == 1 and tcoords == 0: + # No candidates found in any indexed expression component, so leave now + self._seqcache.setitem(seqkey, [], 1) + return iter([]) + + # Compute the final chunkmap + chunkmap = ne.evaluate(strexpr, cmvars) + if not chunkmap.any(): + # The chunkmap is all False, so the result is empty + self._seqcache.setitem(seqkey, [], 1) + return iter([]) + + if profile: + show_stats("Exiting table_whereIndexed", tref) + return chunkmap + + +def create_indexes_table(table): + itgroup = IndexesTableG( + table._v_parent, _index_name_of(table), + "Indexes container for table " + table._v_pathname, new=True) + return itgroup + + +def create_indexes_descr(igroup, dname, iname, filters): + idgroup = IndexesDescG( + igroup, iname, + "Indexes container for sub-description " + dname, + filters=filters, new=True) + return idgroup + + +def _column__create_index(self, optlevel, kind, filters, tmp_dir, + blocksizes, verbose): + name = self.name + table = self.table + dtype = self.dtype + descr = self.descr + index = self.index + get_node = table._v_file._get_node + + # Warn if the index already exists + if index: + raise ValueError("%s for column '%s' already exists. If you want to " + "re-create it, please, try with reindex() method " + "better" % (str(index), str(self.pathname))) + + # Check that the datatype is indexable. 
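# Illustrative sketch (editorial aside, not part of the original sources):
# creating an index on a regular (non-nested, scalar) column and checking that
# a query can use it.  The file name and column layout are invented for the
# example; the checks just below rule out uint64, complex and multidimensional
# columns as index candidates.
import tempfile
from pathlib import Path

import tables as tb

path = Path(tempfile.mkdtemp()) / 'index_demo.h5'
with tb.open_file(str(path), 'w') as h5:
    tbl = h5.create_table('/', 'demo', {'x': tb.Int32Col(), 'y': tb.Float64Col()})
    row = tbl.row
    for i in range(1000):
        row['x'] = i
        row['y'] = i / 10.0
        row.append()
    tbl.flush()
    tbl.cols.x.create_index()   # build an index for column 'x'
    # expected: frozenset({'x'}), since only 'x' is indexed
    print(tbl.will_query_use_indexing('(x > 10) & (y < 5.0)'))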
+ if dtype.str[1:] == 'u8': + raise NotImplementedError( + "indexing 64-bit unsigned integer columns " + "is not supported yet, sorry") + if dtype.kind == 'c': + raise TypeError("complex columns can not be indexed") + if dtype.shape != (): + raise TypeError("multidimensional columns can not be indexed") + + # Get the indexes group for table, and if not exists, create it + try: + itgroup = get_node(_index_pathname_of(table)) + except NoSuchNodeError: + itgroup = create_indexes_table(table) + + # Create the necessary intermediate groups for descriptors + idgroup = itgroup + dname = "" + pathname = descr._v_pathname + if pathname != '': + inames = pathname.split('/') + for iname in inames: + if dname == '': + dname = iname + else: + dname += '/' + iname + try: + idgroup = get_node(f'{itgroup._v_pathname}/{dname}') + except NoSuchNodeError: + idgroup = create_indexes_descr(idgroup, dname, iname, filters) + + # Create the atom + assert dtype.shape == () + atom = Atom.from_dtype(np.dtype((dtype, (0,)))) + + # Protection on tables larger than the expected rows (perhaps the + # user forgot to pass this parameter to the Table constructor?) + expectedrows = table._v_expectedrows + if table.nrows > expectedrows: + expectedrows = table.nrows + + # Create the index itself + index = Index( + idgroup, name, atom=atom, + title="Index for %s column" % name, + kind=kind, + optlevel=optlevel, + filters=filters, + tmp_dir=tmp_dir, + expectedrows=expectedrows, + byteorder=table.byteorder, + blocksizes=blocksizes) + + table._set_column_indexing(self.pathname, True) + + # Feed the index with values + + # Add rows to the index if necessary + if table.nrows > 0: + indexedrows = table._add_rows_to_index( + self.pathname, 0, table.nrows, lastrow=True, update=False) + else: + indexedrows = 0 + index.dirty = False + table._indexedrows = indexedrows + table._unsaved_indexedrows = table.nrows - indexedrows + + # Optimize the index that has been already filled-up + index.optimize(verbose=verbose) + + # We cannot do a flush here because when reindexing during a + # flush, the indexes are created anew, and that creates a nested + # call to flush(). + # table.flush() + + return indexedrows + + +class _ColIndexes(dict): + """Provides a nice representation of column indexes.""" + + def __repr__(self): + """Gives a detailed Description column representation.""" + + rep = [f' \"{k}\": {v}' for k, v in self.items()] + return '{\n %s}' % (',\n '.join(rep)) + + +class Table(tableextension.Table, Leaf): + """This class represents heterogeneous datasets in an HDF5 file. + + Tables are leaves (see the Leaf class in :ref:`LeafClassDescr`) whose data + consists of a unidimensional sequence of *rows*, where each row contains + one or more *fields*. Fields have an associated unique *name* and + *position*, with the first field having position 0. All rows have the same + fields, which are arranged in *columns*. + + Fields can have any type supported by the Col class (see + :ref:`ColClassDescr`) and its descendants, which support multidimensional + data. Moreover, a field can be *nested* (to an arbitrary depth), meaning + that it includes further fields inside. A field named x inside a nested + field a in a table can be accessed as the field a/x (its *path name*) from + the table. + + The structure of a table is declared by its description, which is made + available in the Table.description attribute (see :class:`Table`). + + This class provides new methods to read, write and search table data + efficiently. 
It also provides special Python methods to allow accessing + the table as a normal sequence or array (with extended slicing supported). + + PyTables supports *in-kernel* searches working simultaneously on several + columns using complex conditions. These are faster than selections using + Python expressions. See the :meth:`Table.where` method for more + information on in-kernel searches. + + Non-nested columns can be *indexed*. Searching an indexed column can be + several times faster than searching a non-nested one. Search methods + automatically take advantage of indexing where available. + + When iterating a table, an object from the Row (see :ref:`RowClassDescr`) + class is used. This object allows to read and write data one row at a + time, as well as to perform queries which are not supported by in-kernel + syntax (at a much lower speed, of course). + + Objects of this class support access to individual columns via *natural + naming* through the :attr:`Table.cols` accessor. Nested columns are + mapped to Cols instances, and non-nested ones to Column instances. + See the Column class in :ref:`ColumnClassDescr` for examples of this + feature. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + + .. versionchanged:: 3.0 + Renamed from *parentNode* to *parentnode*. + + name : str + The name of this node in its parent group. + description + An IsDescription subclass or a dictionary where the keys are the field + names, and the values the type definitions. In addition, a pure NumPy + dtype is accepted. If None, the table metadata is read from disk, + else, it's taken from previous parameters. + title + Sets a TITLE attribute on the HDF5 table entity. + filters : Filters + An instance of the Filters class that provides information about the + desired I/O filters to be applied during the life of this object. + expectedrows + A user estimate about the number of rows that will be on table. If not + provided, the default value is ``EXPECTED_ROWS_TABLE`` (see + ``tables/parameters.py``). If you plan to save bigger tables, try + providing a guess; this will optimize the HDF5 B-Tree creation and + management process time and memory used. + chunkshape + The shape of the data chunk to be read or written as a single HDF5 I/O + operation. The filters are applied to those chunks of data. Its rank + for tables has to be 1. If ``None``, a sensible value is calculated + based on the `expectedrows` parameter (which is recommended). + byteorder + The byteorder of the data *on-disk*, specified as 'little' or 'big'. If + this is not specified, the byteorder is that of the platform, unless + you passed a recarray as the `description`, in which case the recarray + byteorder will be chosen. + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, object + birth time); default True. Semantics of these times depend on their + implementation in the HDF5 library: refer to documentation of the + H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata + change time) is implemented. + + .. versionadded:: 3.4.3 + + Notes + ----- + The instance variables below are provided in addition to those in + Leaf (see :ref:`LeafClassDescr`). Please note that there are several + col* dictionaries to ease retrieving information about a column + directly by its path name, avoiding the need to walk through + Table.description or Table.cols. + + + .. rubric:: Table attributes + + .. 
attribute:: coldescrs + + Maps the name of a column to its Col description (see + :ref:`ColClassDescr`). + + .. attribute:: coldflts + + Maps the name of a column to its default value. + + .. attribute:: coldtypes + + Maps the name of a column to its NumPy data type. + + .. attribute:: colindexed + + Is the column which name is used as a key indexed? + + .. attribute:: colinstances + + Maps the name of a column to its Column (see + :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`) + instance. + + .. attribute:: colnames + + A list containing the names of *top-level* columns in the table. + + .. attribute:: colpathnames + + A list containing the pathnames of *bottom-level* columns in + the table. + + These are the leaf columns obtained when walking the table + description left-to-right, bottom-first. Columns inside a + nested column have slashes (/) separating name components in + their pathname. + + .. attribute:: cols + + A Cols instance that provides *natural naming* access to + non-nested (Column, see :ref:`ColumnClassDescr`) and nested + (Cols, see :ref:`ColsClassDescr`) columns. + + .. attribute:: coltypes + + Maps the name of a column to its PyTables data type. + + .. attribute:: description + + A Description instance (see :ref:`DescriptionClassDescr`) + reflecting the structure of the table. + + .. attribute:: extdim + + The index of the enlargeable dimension (always 0 for tables). + + .. attribute:: indexed + + Does this table have any indexed columns? + + .. attribute:: nrows + + The current number of rows in the table. + + """ + + # Class identifier. + _c_classid = 'TABLE' + + @lazyattr + def row(self): + """The associated Row instance (see :ref:`RowClassDescr`).""" + + return tableextension.Row(self) + + @lazyattr + def dtype(self): + """The NumPy ``dtype`` that most closely matches this table.""" + + return self.description._v_dtype + + @property + def shape(self): + """The shape of this table.""" + return (self.nrows,) + + @property + def rowsize(self): + """The size in bytes of each row in the table.""" + return self.description._v_dtype.itemsize + + @property + def size_in_memory(self): + """The size of this table's data in bytes when it is fully loaded into + memory. This may be used in combination with size_on_disk to calculate + the compression ratio of the data.""" + return self.nrows * self.rowsize + + @lazyattr + def _v_iobuf(self): + """A buffer for doing I/O.""" + + return self._get_container(self.nrowsinbuf) + + @lazyattr + def _v_wdflts(self): + """The defaults for writing in recarray format.""" + + # First, do a check to see whether we need to set default values + # different from 0 or not. + for coldflt in self.coldflts.values(): + if isinstance(coldflt, np.ndarray) or coldflt: + break + else: + # No default different from 0 found. Returning None. + return None + wdflts = self._get_container(1) + for colname, coldflt in self.coldflts.items(): + ra = get_nested_field(wdflts, colname) + ra[:] = coldflt + return wdflts + + @lazyattr + def _colunaligned(self): + """The pathnames of unaligned, *unidimensional* columns.""" + colunaligned, rarr = [], self._get_container(0) + for colpathname in self.colpathnames: + carr = get_nested_field(rarr, colpathname) + if not carr.flags.aligned and carr.ndim == 1: + colunaligned.append(colpathname) + return frozenset(colunaligned) + + # **************** WARNING! 
*********************** + # This function can be called during the destruction time of a table + # so measures have been taken so that it doesn't have to revive + # another node (which can fool the LRU cache). The solution devised + # has been to add a cache for autoindex (Table._autoindex), populate + # it in creation time of the cache (which is a safe period) and then + # update the cache whenever it changes. + # This solves the error when running test_indexes.py ManyNodesTestCase. + # F. Alted 2007-04-20 + # ************************************************** + + @property + def autoindex(self): + """Automatically keep column indexes up to date? + + Setting this value states whether existing indexes should be + automatically updated after an append operation or recomputed + after an index-invalidating operation (i.e. removal and + modification of rows). The default is true. + + This value gets into effect whenever a column is altered. If you + don't have automatic indexing activated and you want to do an an + immediate update use `Table.flush_rows_to_index()`; for an immediate + reindexing of invalidated indexes, use `Table.reindex_dirty()`. + + This value is persistent. + + .. versionchanged:: 3.0 + The *autoIndex* property has been renamed into *autoindex*. + """ + + if self._autoindex is None: + try: + indexgroup = self._v_file._get_node(_index_pathname_of(self)) + except NoSuchNodeError: + self._autoindex = default_auto_index # update cache + return self._autoindex + else: + self._autoindex = indexgroup.auto # update cache + return self._autoindex + else: + # The value is in cache, return it + return self._autoindex + + @autoindex.setter + def autoindex(self, auto): + auto = bool(auto) + try: + indexgroup = self._v_file._get_node(_index_pathname_of(self)) + except NoSuchNodeError: + indexgroup = create_indexes_table(self) + indexgroup.auto = auto + # Update the cache in table instance as well + self._autoindex = auto + + @property + def indexedcolpathnames(self): + """List of pathnames of indexed columns in the table.""" + return [_colpname + for _colpname in self.colpathnames + if self.colindexed[_colpname]] + + @property + def colindexes(self): + """A dictionary with the indexes of the indexed columns.""" + return _ColIndexes((_colpname, self.cols._f_col(_colpname).index) + for _colpname in self.colpathnames + if self.colindexed[_colpname]) + + @property + def _dirtyindexes(self): + """Whether some index in table is dirty.""" + return self._condition_cache._nailcount > 0 + + def __init__(self, parentnode, name, + description=None, title="", filters=None, + expectedrows=None, chunkshape=None, + byteorder=None, _log=True, track_times=True): + + self._v_new = new = description is not None + """Is this the first time the node has been created?""" + self._v_new_title = title + """New title for this node.""" + self._v_new_filters = filters + """New filter properties for this node.""" + self.extdim = 0 # Tables only have one dimension currently + """The index of the enlargeable dimension (always 0 for tables).""" + self._v_recarray = None + """A structured array to be stored in the table.""" + self._rabyteorder = None + """The computed byteorder of the self._v_recarray.""" + if expectedrows is None: + expectedrows = parentnode._v_file.params['EXPECTED_ROWS_TABLE'] + self._v_expectedrows = expectedrows + """The expected number of rows to be stored in the table.""" + self.nrows = SizeType(0) + """The current number of rows in the table.""" + self.description = None + """A Description 
instance (see :ref:`DescriptionClassDescr`) + reflecting the structure of the table.""" + self._time64colnames = [] + """The names of ``Time64`` columns.""" + self._strcolnames = [] + """The names of ``String`` columns.""" + self._colenums = {} + """Maps the name of an enumerated column to its ``Enum`` instance.""" + self._v_chunkshape = None + """Private storage for the `chunkshape` property of the leaf.""" + + self.indexed = False + """Does this table have any indexed columns?""" + self._indexedrows = 0 + """Number of rows indexed in disk.""" + self._unsaved_indexedrows = 0 + """Number of rows indexed in memory but still not in disk.""" + self._listoldindexes = [] + """The list of columns with old indexes.""" + self._autoindex = None + """Private variable that caches the value for autoindex.""" + + self.colnames = [] + """A list containing the names of *top-level* columns in the table.""" + self.colpathnames = [] + """A list containing the pathnames of *bottom-level* columns in the + table. + + These are the leaf columns obtained when walking the + table description left-to-right, bottom-first. Columns inside a + nested column have slashes (/) separating name components in + their pathname. + """ + self.colinstances = {} + """Maps the name of a column to its Column (see + :ref:`ColumnClassDescr`) or Cols (see :ref:`ColsClassDescr`) + instance.""" + self.coldescrs = {} + """Maps the name of a column to its Col description (see + :ref:`ColClassDescr`).""" + self.coltypes = {} + """Maps the name of a column to its PyTables data type.""" + self.coldtypes = {} + """Maps the name of a column to its NumPy data type.""" + self.coldflts = {} + """Maps the name of a column to its default value.""" + self.colindexed = {} + """Is the column which name is used as a key indexed?""" + + self._use_index = False + """Whether an index can be used or not in a search. Boolean.""" + self._where_condition = None + """Condition function and argument list for selection of values.""" + self._seqcache_key = None + """The key under which to save a query's results (list of row indexes) + or None to not save.""" + max_slots = parentnode._v_file.params['COND_CACHE_SLOTS'] + self._condition_cache = CacheDict(max_slots) + """Cache of already compiled conditions.""" + self._exprvars_cache = {} + """Cache of variables participating in numexpr expressions.""" + self._enabled_indexing_in_queries = True + """Is indexing enabled in queries? *Use only for testing.*""" + self._empty_array_cache = {} + """Cache of empty arrays.""" + + self._v_dtype = None + """The NumPy datatype fopr this table.""" + self.cols = None + """ + A Cols instance that provides *natural naming* access to non-nested + (Column, see :ref:`ColumnClassDescr`) and nested (Cols, see + :ref:`ColsClassDescr`) columns. + """ + self._dirtycache = True + """Whether the data caches are dirty or not. Initially set to yes.""" + self._descflavor = None + """Temporarily keeps the flavor of a description with data.""" + + # Initialize this object in case is a new Table + + # Try purely descriptive description objects. 
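# Illustrative sketch (editorial aside, not part of the original sources): the
# same table layout expressed in three of the descriptive forms accepted here,
# i.e. a dictionary, an IsDescription subclass and a NumPy dtype.  The field
# names are arbitrary examples.
import numpy as np
import tables as tb

as_dict = {'name': tb.StringCol(16), 'value': tb.Float64Col()}

class AsClass(tb.IsDescription):
    name = tb.StringCol(16)
    value = tb.Float64Col()

as_dtype = np.dtype([('name', 'S16'), ('value', 'f8')])
# Any of `as_dict`, `AsClass` or `as_dtype` may be passed as the
# ``description`` argument of File.create_table().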
+ if new and isinstance(description, dict): + # Dictionary case + self.description = Description(description, + ptparams=parentnode._v_file.params) + elif new and (type(description) == type(IsDescription) + and issubclass(description, IsDescription)): + # IsDescription subclass case + descr = description() + self.description = Description(descr.columns, + ptparams=parentnode._v_file.params) + elif new and isinstance(description, Description): + # It is a Description instance already + self.description = description + + # No description yet? + if new and self.description is None: + # Try NumPy dtype instances + if isinstance(description, np.dtype): + tup = descr_from_dtype(description, + ptparams=parentnode._v_file.params) + self.description, self._rabyteorder = tup + + # No description yet? + if new and self.description is None: + # Try structured array description objects. + try: + self._descflavor = flavor = flavor_of(description) + except TypeError: # probably not an array + pass + else: + if flavor == 'python': + nparray = np.rec.array(description) + else: + nparray = array_as_internal(description, flavor) + self.nrows = nrows = SizeType(nparray.size) + # If `self._v_recarray` is set, it will be used as the + # initial buffer. + if nrows > 0: + self._v_recarray = nparray + tup = descr_from_dtype(nparray.dtype, + ptparams=parentnode._v_file.params) + self.description, self._rabyteorder = tup + + # No description yet? + if new and self.description is None: + raise TypeError( + "the ``description`` argument is not of a supported type: " + "``IsDescription`` subclass, ``Description`` instance, " + "dictionary, or structured array") + + # Check the chunkshape parameter + if new and chunkshape is not None: + if isinstance(chunkshape, (int, np.integer)): + chunkshape = (chunkshape,) + try: + chunkshape = tuple(chunkshape) + except TypeError: + raise TypeError( + "`chunkshape` parameter must be an integer or sequence " + "and you passed a %s" % type(chunkshape)) + if len(chunkshape) != 1: + raise ValueError("`chunkshape` rank (length) must be 1: %r" + % (chunkshape,)) + self._v_chunkshape = tuple(SizeType(s) for s in chunkshape) + + super().__init__(parentnode, name, new, filters, byteorder, _log, + track_times) + + def _g_post_init_hook(self): + # We are putting here the index-related issues + # as well as filling general info for table + # This is needed because we need first the index objects created + + # First, get back the flavor of input data (if any) for + # `Leaf._g_post_init_hook()`. + self._flavor, self._descflavor = self._descflavor, None + super()._g_post_init_hook() + + # Create a cols accessor. + self.cols = Cols(self, self.description) + + # Place the `Cols` and `Column` objects into `self.colinstances`. + colinstances, cols = self.colinstances, self.cols + for colpathname in self.description._v_pathnames: + colinstances[colpathname] = cols._g_col(colpathname) + + if self._v_new: + # Columns are never indexed on creation. + self.colindexed = {cpn: False for cpn in self.colpathnames} + return + + # The following code is only for opened tables. + + # Do the indexes group exist? + indexesgrouppath = _index_pathname_of(self) + igroup = indexesgrouppath in self._v_file + oldindexes = False + for colobj in self.description._f_walk(type="Col"): + colname = colobj._v_pathname + # Is this column indexed? 
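# Illustrative sketch (editorial aside, not part of the original sources): the
# hidden group that stores a table's indexes lives next to the table under a
# name prefixed with '_i_', with one node per indexed column.  The helper
# below mirrors the `_index_pathname_of_column_()` function defined earlier in
# this module; the table and column names are made up.
import posixpath

def index_pathname_of_column(table_path, colpathname):
    parent, name = posixpath.split(table_path)
    return posixpath.join(parent, '_i_' + name, colpathname)

print(index_pathname_of_column('/group/mytable', 'pressure'))
# -> /group/_i_mytable/pressure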
+ if igroup: + indexname = _index_pathname_of_column(self, colname) + indexed = indexname in self._v_file + self.colindexed[colname] = indexed + if indexed: + column = self.cols._g_col(colname) + indexobj = column.index + if isinstance(indexobj, OldIndex): + indexed = False # Not a vaild index + oldindexes = True + self._listoldindexes.append(colname) + else: + # Tell the condition cache about columns with dirty + # indexes. + if indexobj.dirty: + self._condition_cache.nail() + else: + indexed = False + self.colindexed[colname] = False + if indexed: + self.indexed = True + + if oldindexes: # this should only appear under 2.x Pro + warnings.warn( + "table ``%s`` has column indexes with PyTables 1.x format. " + "Unfortunately, this format is not supported in " + "PyTables 2.x series. Note that you can use the " + "``ptrepack`` utility in order to recreate the indexes. " + "The 1.x indexed columns found are: %s" % + (self._v_pathname, self._listoldindexes), + OldIndexWarning) + + # It does not matter to which column 'indexobj' belongs, + # since their respective index objects share + # the same number of elements. + if self.indexed: + self._indexedrows = indexobj.nelements + self._unsaved_indexedrows = self.nrows - self._indexedrows + # Put the autoindex value in a cache variable + self._autoindex = self.autoindex + + def _calc_nrowsinbuf(self): + """Calculate the number of rows that fits on a PyTables buffer.""" + + params = self._v_file.params + # Compute the nrowsinbuf + rowsize = self.rowsize + buffersize = params['IO_BUFFER_SIZE'] + if rowsize != 0: + nrowsinbuf = buffersize // rowsize + # The number of rows in buffer needs to be an exact multiple of + # chunkshape[0] for queries using indexed columns. + # Fixes #319 and probably #409 too. + nrowsinbuf -= nrowsinbuf % self.chunkshape[0] + else: + nrowsinbuf = 1 + + # tableextension.pyx performs an assertion + # to make sure nrowsinbuf is greater than or + # equal to the chunksize. + # See gh-206 and gh-238 + if self.chunkshape is not None: + if nrowsinbuf < self.chunkshape[0]: + nrowsinbuf = self.chunkshape[0] + + # Safeguard against row sizes being extremely large + if nrowsinbuf == 0: + nrowsinbuf = 1 + # If rowsize is too large, issue a Performance warning + maxrowsize = params['BUFFER_TIMES'] * buffersize + if rowsize > maxrowsize: + warnings.warn("""\ +The Table ``%s`` is exceeding the maximum recommended rowsize (%d bytes); +be ready to see PyTables asking for *lots* of memory and possibly slow +I/O. You may want to reduce the rowsize by trimming the value of +dimensions that are orthogonal (and preferably close) to the *main* +dimension of this leave. 
Alternatively, in case you have specified a +very small/large chunksize, you may want to increase/decrease it.""" + % (self._v_pathname, maxrowsize), + PerformanceWarning) + return nrowsinbuf + + def _getemptyarray(self, dtype): + # Acts as a cache for empty arrays + key = dtype + if key in self._empty_array_cache: + return self._empty_array_cache[key] + else: + self._empty_array_cache[ + key] = arr = np.empty(shape=0, dtype=key) + return arr + + def _get_container(self, shape): + """Get the appropriate buffer for data depending on table + nestedness.""" + + # This is *much* faster than the numpy.rec.array counterpart + return np.empty(shape=shape, dtype=self._v_dtype) + + def _get_type_col_names(self, type_): + """Returns a list containing 'type_' column names.""" + + return [colobj._v_pathname + for colobj in self.description._f_walk('Col') + if colobj.type == type_] + + def _get_enum_map(self): + """Return mapping from enumerated column names to `Enum` instances.""" + + enumMap = {} + for colobj in self.description._f_walk('Col'): + if colobj.kind == 'enum': + enumMap[colobj._v_pathname] = colobj.enum + return enumMap + + def _g_create(self): + """Create a new table on disk.""" + + # Warning against assigning too much columns... + # F. Alted 2005-06-05 + maxColumns = self._v_file.params['MAX_COLUMNS'] + if (len(self.description._v_names) > maxColumns): + warnings.warn( + "table ``%s`` is exceeding the recommended " + "maximum number of columns (%d); " + "be ready to see PyTables asking for *lots* of memory " + "and possibly slow I/O" % (self._v_pathname, maxColumns), + PerformanceWarning) + + # 1. Create the HDF5 table (some parameters need to be computed). + + # Fix the byteorder of the recarray and update the number of + # expected rows if necessary + if self._v_recarray is not None: + self._v_recarray = self._g_fix_byteorder_data(self._v_recarray, + self._rabyteorder) + if len(self._v_recarray) > self._v_expectedrows: + self._v_expectedrows = len(self._v_recarray) + # Compute a sensible chunkshape + if self._v_chunkshape is None: + self._v_chunkshape = self._calc_chunkshape( + self._v_expectedrows, self.rowsize, self.rowsize) + # Correct the byteorder, if still needed + if self.byteorder is None: + self.byteorder = sys.byteorder + + # Cache some data which is already in the description. + # This is necessary to happen before creation time in order + # to be able to populate the self._v_wdflts + self._cache_description_data() + + # After creating the table, ``self._v_objectid`` needs to be + # set because it is needed for setting attributes afterwards. + self._v_objectid = self._create_table( + self._v_new_title, self.filters.complib or '', obversion) + self._v_recarray = None # not useful anymore + self._rabyteorder = None # not useful anymore + + # 2. Compute or get chunk shape and buffer size parameters. + self.nrowsinbuf = self._calc_nrowsinbuf() + + # 3. Get field fill attributes from the table description and + # set them on disk. + if self._v_file.params['PYTABLES_SYS_ATTRS']: + set_attr = self._v_attrs._g__setattr + for i, colobj in enumerate(self.description._f_walk(type="Col")): + fieldname = "FIELD_%d_FILL" % i + set_attr(fieldname, colobj.dflt) + + return self._v_objectid + + def _g_open(self): + """Opens a table from disk and read the metadata on it. + + Creates an user description on the flight to easy the access to + the actual data. + + """ + + # 1. Open the HDF5 table and get some data from it. 
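# Illustrative sketch (editorial aside, not part of the original sources): the
# buffer-size arithmetic performed by _calc_nrowsinbuf() above.  The buffer
# size, row size and chunk length are arbitrary example values; the real
# buffer size comes from the IO_BUFFER_SIZE parameter.
io_buffer_size = 1024 * 1024      # bytes (example value)
rowsize = 600                     # bytes per row (example value)
chunklen = 1000                   # chunkshape[0] (example value)

nrowsinbuf = io_buffer_size // rowsize
nrowsinbuf -= nrowsinbuf % chunklen     # exact multiple of the chunk length
nrowsinbuf = max(nrowsinbuf, chunklen)  # never less than one chunk of rows
print(nrowsinbuf)  # -> 1000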
+ self._v_objectid, description, chunksize = self._get_info() + self._v_expectedrows = self.nrows # the actual number of rows + + # 2. Create an instance description to host the record fields. + validate = not self._v_file._isPTFile # only for non-PyTables files + self.description = Description(description, validate=validate, + ptparams=self._v_file.params) + + # 3. Compute or get chunk shape and buffer size parameters. + if chunksize == 0: + self._v_chunkshape = self._calc_chunkshape( + self._v_expectedrows, self.rowsize, self.rowsize) + else: + self._v_chunkshape = (chunksize,) + self.nrowsinbuf = self._calc_nrowsinbuf() + + # 4. If there are field fill attributes, get them from disk and + # set them in the table description. + if self._v_file.params['PYTABLES_SYS_ATTRS']: + if "FIELD_0_FILL" in self._v_attrs._f_list("sys"): + i = 0 + get_attr = self._v_attrs.__getattr__ + for objcol in self.description._f_walk(type="Col"): + colname = objcol._v_pathname + # Get the default values for each column + fieldname = "FIELD_%s_FILL" % i + defval = get_attr(fieldname) + if defval is not None: + objcol.dflt = defval + else: + warnings.warn("could not load default value " + "for the ``%s`` column of table ``%s``; " + "using ``%r`` instead" + % (colname, self._v_pathname, + objcol.dflt)) + defval = objcol.dflt + i += 1 + + # Set also the correct value in the desc._v_dflts dictionary + for descr in self.description._f_walk(type="Description"): + for name in descr._v_names: + objcol = descr._v_colobjects[name] + if isinstance(objcol, Col): + descr._v_dflts[objcol._v_name] = objcol.dflt + + # 5. Cache some data which is already in the description. + self._cache_description_data() + + return self._v_objectid + + def _cache_description_data(self): + """Cache some data which is already in the description. + + Some information is extracted from `self.description` to build + some useful (but redundant) structures: + + * `self.colnames` + * `self.colpathnames` + * `self.coldescrs` + * `self.coltypes` + * `self.coldtypes` + * `self.coldflts` + * `self._v_dtype` + * `self._time64colnames` + * `self._strcolnames` + * `self._colenums` + + """ + + self.colnames = list(self.description._v_names) + self.colpathnames = [ + col._v_pathname for col in self.description._f_walk() + if not hasattr(col, '_v_names')] # bottom-level + + # Find ``time64`` column names. + self._time64colnames = self._get_type_col_names('time64') + # Find ``string`` column names. + self._strcolnames = self._get_type_col_names('string') + # Get a mapping of enumerated columns to their `Enum` instances. + self._colenums = self._get_enum_map() + + # Get info about columns + for colobj in self.description._f_walk(type="Col"): + colname = colobj._v_pathname + # Get the column types, types and defaults + self.coldescrs[colname] = colobj + self.coltypes[colname] = colobj.type + self.coldtypes[colname] = colobj.dtype + self.coldflts[colname] = colobj.dflt + + # Assign _v_dtype for this table + self._v_dtype = self.description._v_dtype + + def _get_column_instance(self, colpathname): + """Get the instance of the column with the given `colpathname`. + + If the column does not exist in the table, a `KeyError` is + raised. 
+ + """ + + try: + return functools.reduce( + getattr, colpathname.split('/'), self.description) + except AttributeError: + raise KeyError("table ``%s`` does not have a column named ``%s``" + % (self._v_pathname, colpathname)) + + _check_column = _get_column_instance + + def _disable_indexing_in_queries(self): + """Force queries not to use indexing. + + *Use only for testing.* + + """ + + if not self._enabled_indexing_in_queries: + return # already disabled + # The nail avoids setting/getting compiled conditions in/from + # the cache where indexing is used. + self._condition_cache.nail() + self._enabled_indexing_in_queries = False + + def _enable_indexing_in_queries(self): + """Allow queries to use indexing. + + *Use only for testing.* + + """ + + if self._enabled_indexing_in_queries: + return # already enabled + self._condition_cache.unnail() + self._enabled_indexing_in_queries = True + + def _required_expr_vars(self, expression, uservars, depth=1): + """Get the variables required by the `expression`. + + A new dictionary defining the variables used in the `expression` + is returned. Required variables are first looked up in the + `uservars` mapping, then in the set of top-level columns of the + table. Unknown variables cause a `NameError` to be raised. + + When `uservars` is `None`, the local and global namespace where + the API callable which uses this method is called is sought + instead. This mechanism will not work as expected if this + method is not used *directly* from an API callable. To disable + this mechanism, just specify a mapping as `uservars`. + + Nested columns and columns from other tables are not allowed + (`TypeError` and `ValueError` are raised, respectively). Also, + non-column variable values are converted to NumPy arrays. + + `depth` specifies the depth of the frame in order to reach local + or global variables. + + """ + + # Get the names of variables used in the expression. + exprvarscache = self._exprvars_cache + if expression not in exprvarscache: + # Protection against growing the cache too much + if len(exprvarscache) > 256: + # Remove 10 (arbitrary) elements from the cache + for k in list(exprvarscache)[:10]: + del exprvarscache[k] + cexpr = compile(expression, '', 'eval') + exprvars = [var for var in cexpr.co_names + if var not in ['None', 'False', 'True'] + and var not in ne.expressions.functions] + exprvarscache[expression] = exprvars + else: + exprvars = exprvarscache[expression] + + # Get the local and global variable mappings of the user frame + # if no mapping has been explicitly given for user variables. + user_locals, user_globals = {}, {} + if uservars is None: + # We use specified depth to get the frame where the API + # callable using this method is called. For instance: + # + # * ``table._required_expr_vars()`` (depth 0) is called by + # * ``table._where()`` (depth 1) is called by + # * ``table.where()`` (depth 2) is called by + # * user-space functions (depth 3) + user_frame = sys._getframe(depth) + user_locals = user_frame.f_locals + user_globals = user_frame.f_globals + + colinstances = self.colinstances + tblfile, tblpath = self._v_file, self._v_pathname + # Look for the required variables first among the ones + # explicitly provided by the user, then among implicit columns, + # then among external variables (only if no explicit variables). + reqvars = {} + for var in exprvars: + # Get the value. 
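# Illustrative sketch (editorial aside, not part of the original sources): how
# a condition variable is typically supplied.  Explicit `condvars` entries win
# over table columns, which in turn win over the caller's local/global names
# (the look-up order implemented just below).  The file and column names are
# made up for the example.
import tempfile
from pathlib import Path

import tables as tb

path = Path(tempfile.mkdtemp()) / 'where_demo.h5'
with tb.open_file(str(path), 'w') as h5:
    tbl = h5.create_table('/', 'demo', {'temperature': tb.Float64Col()})
    row = tbl.row
    for i in range(100):
        row['temperature'] = float(i)
        row.append()
    tbl.flush()

    limit = 95.0  # picked up from the caller's namespace when condvars is None
    hot = [r['temperature'] for r in tbl.where('temperature > limit')]
    same = [r['temperature'] for r in
            tbl.where('temperature > lim', condvars={'lim': limit})]
    print(hot == same)  # -> True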
+ if uservars is not None and var in uservars: + val = uservars[var] + elif var in colinstances: + val = colinstances[var] + elif uservars is None and var in user_locals: + val = user_locals[var] + elif uservars is None and var in user_globals: + val = user_globals[var] + else: + raise NameError("name ``%s`` is not defined" % var) + + # Check the value. + if hasattr(val, 'pathname'): # non-nested column + if val.shape[1:] != (): + raise NotImplementedError( + "variable ``%s`` refers to " + "a multidimensional column, " + "not yet supported in conditions, sorry" % var) + if (val._table_file is not tblfile or + val._table_path != tblpath): + raise ValueError("variable ``%s`` refers to a column " + "which is not part of table ``%s``" + % (var, tblpath)) + if val.dtype.str[1:] == 'u8': + raise NotImplementedError( + "variable ``%s`` refers to " + "a 64-bit unsigned integer column, " + "not yet supported in conditions, sorry; " + "please use regular Python selections" % var) + elif hasattr(val, '_v_colpathnames'): # nested column + raise TypeError( + "variable ``%s`` refers to a nested column, " + "not allowed in conditions" % var) + else: # only non-column values are converted to arrays + # XXX: not 100% sure about this + if isinstance(val, str): + val = np.asarray(val.encode('ascii')) + else: + val = np.asarray(val) + reqvars[var] = val + return reqvars + + def _get_condition_key(self, condition, condvars): + """Get the condition cache key for `condition` with `condvars`. + + Currently, the key is a tuple of `condition`, column variables + names, normal variables names, column paths and variable paths + (all are tuples). + + """ + + # Variable names for column and normal variables. + colnames, varnames = [], [] + # Column paths and types for each of the previous variable. + colpaths, vartypes = [], [] + for (var, val) in condvars.items(): + if hasattr(val, 'pathname'): # column + colnames.append(var) + colpaths.append(val.pathname) + else: # array + try: + varnames.append(var) + vartypes.append(ne.necompiler.getType(val)) # expensive + except ValueError: + # This is more clear than the error given by Numexpr. + raise TypeError("variable ``%s`` has data type ``%s``, " + "not allowed in conditions" + % (var, val.dtype.name)) + colnames, varnames = tuple(colnames), tuple(varnames) + colpaths, vartypes = tuple(colpaths), tuple(vartypes) + condkey = (condition, colnames, varnames, colpaths, vartypes) + return condkey + + def _compile_condition(self, condition, condvars): + """Compile the `condition` and extract usable index conditions. + + This method returns an instance of ``CompiledCondition``. See + the ``compile_condition()`` function in the ``conditions`` + module for more information about the compilation process. + + This method makes use of the condition cache when possible. + + """ + + # Look up the condition in the condition cache. + condcache = self._condition_cache + condkey = self._get_condition_key(condition, condvars) + compiled = condcache.get(condkey) + if compiled: + return compiled.with_replaced_vars(condvars) # bingo! + + # Bad luck, the condition must be parsed and compiled. + # Fortunately, the key provides some valuable information. ;) + (condition, colnames, varnames, colpaths, vartypes) = condkey + + # Extract more information from referenced columns. + + # start with normal variables + typemap = dict(list(zip(varnames, vartypes))) + indexedcols = [] + for colname in colnames: + col = condvars[colname] + + # Extract types from *all* the given variables. 
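# Illustrative sketch (editorial aside, not part of the original sources): the
# shape of the cache key built by _get_condition_key() above.  A condition is
# keyed by the expression text plus the names, paths and Numexpr types of the
# variables taking part in it, so the compiled form can be reused across
# calls.  The values shown are invented for the example.
condition = '(pressure > p0) & (energy < e0)'
colnames = ('pressure', 'energy')          # variables bound to table columns
varnames = ('p0', 'e0')                    # plain (array-like) variables
colpaths = ('pressure', 'energy')          # column path names inside the table
vartypes = (float, float)                  # Numexpr types of the plain variables
condkey = (condition, colnames, varnames, colpaths, vartypes)
print(hash(condkey) == hash(condkey))      # usable as a dictionary key -> True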
+ coltype = col.dtype.type + typemap[colname] = _nxtype_from_nptype[coltype] + + # Get the set of columns with usable indexes. + if (self._enabled_indexing_in_queries # no in-kernel searches + and self.colindexed[col.pathname] and not col.index.dirty): + indexedcols.append(colname) + + indexedcols = frozenset(indexedcols) + # Now let ``compile_condition()`` do the Numexpr-related job. + compiled = compile_condition(condition, typemap, indexedcols) + + # Check that there actually are columns in the condition. + if not set(compiled.parameters).intersection(set(colnames)): + raise ValueError("there are no columns taking part " + "in condition ``%s``" % (condition,)) + + # Store the compiled condition in the cache and return it. + condcache[condkey] = compiled + return compiled.with_replaced_vars(condvars) + + def will_query_use_indexing(self, condition, condvars=None): + """Will a query for the condition use indexing? + + The meaning of the condition and *condvars* arguments is the same as in + the :meth:`Table.where` method. If condition can use indexing, this + method returns a frozenset with the path names of the columns whose + index is usable. Otherwise, it returns an empty list. + + This method is mainly intended for testing. Keep in mind that changing + the set of indexed columns or their dirtiness may make this method + return different values for the same arguments at different times. + + """ + + # Compile the condition and extract usable index conditions. + condvars = self._required_expr_vars(condition, condvars, depth=2) + compiled = self._compile_condition(condition, condvars) + # Return the columns in indexed expressions + idxcols = [condvars[var].pathname for var in compiled.index_variables] + return frozenset(idxcols) + + def where(self, condition, condvars=None, + start=None, stop=None, step=None): + r"""Iterate over values fulfilling a condition. + + This method returns a Row iterator (see :ref:`RowClassDescr`) which + only selects rows in the table that satisfy the given condition (an + expression-like string). + + The condvars mapping may be used to define the variable names appearing + in the condition. condvars should consist of identifier-like strings + pointing to Column (see :ref:`ColumnClassDescr`) instances *of this + table*, or to other values (which will be converted to arrays). A + default set of condition variables is provided where each top-level, + non-nested column with an identifier-like name appears. Variables in + condvars override the default ones. + + When condvars is not provided or None, the current local and global + namespace is sought instead of condvars. The previous mechanism is + mostly intended for interactive usage. To disable it, just specify a + (maybe empty) mapping as condvars. + + If a range is supplied (by setting some of the start, stop or step + parameters), only the rows in that range and fulfilling the condition + are used. The meaning of the start, stop and step parameters is the + same as for Python slices. + + When possible, indexed columns participating in the condition will be + used to speed up the search. It is recommended that you place the + indexed columns as left and out in the condition as possible. Anyway, + this method has always better performance than regular Python + selections on the table. + + You can mix this method with regular Python selections in order to + support even more complex queries. 
It is strongly recommended that you + pass the most restrictive condition as the parameter to this method if + you want to achieve maximum performance. + + .. warning:: + + When in the middle of a table row iterator, you should not + use methods that can change the number of rows in the table + (like :meth:`Table.append` or :meth:`Table.remove_rows`) or + unexpected errors will happen. + + Examples + -------- + + :: + + passvalues = [ row['col3'] for row in + table.where('(col1 > 0) & (col2 <= 20)', step=5) + if your_function(row['col2']) ] + print("Values that pass the cuts:", passvalues) + + .. note:: + + A special care should be taken when the query condition includes + string literals. + + Let's assume that the table ``table`` has the following + structure:: + + class Record(IsDescription): + col1 = StringCol(4) # 4-character String of bytes + col2 = IntCol() + col3 = FloatCol() + + The type of "col1" corresponds to strings of bytes. + + Any condition involving "col1" should be written using the + appropriate type for string literals in order to avoid + :exc:`TypeError`\ s. + + The code below will fail with a :exc:`TypeError`:: + + condition = 'col1 == "AAAA"' + for record in table.where(condition): # TypeError in Python3 + # do something with "record" + + The reason is that in Python 3 "condition" implies a comparison + between a string of bytes ("col1" contents) and a unicode literal + ("AAAA"). + + The correct way to write the condition is:: + + condition = 'col1 == b"AAAA"' + + .. versionchanged:: 3.0 + The start, stop and step parameters now behave like in slice. + + """ + + return self._where(condition, condvars, start, stop, step) + + def _where(self, condition, condvars, start=None, stop=None, step=None): + """Low-level counterpart of `self.where()`.""" + + if profile: + tref = clock() + if profile: + show_stats("Entering table._where", tref) + # Adjust the slice to be used. + (start, stop, step) = self._process_range_read(start, stop, step) + if start >= stop: # empty range, reset conditions + self._use_index = False + self._where_condition = None + return iter([]) + + # Compile the condition and extract usable index conditions. + condvars = self._required_expr_vars(condition, condvars, depth=3) + compiled = self._compile_condition(condition, condvars) + + # Can we use indexes? + if compiled.index_expressions: + chunkmap = _table__where_indexed( + self, compiled, condition, condvars, start, stop, step) + if not isinstance(chunkmap, np.ndarray): + # If it is not a NumPy array it should be an iterator + # Reset conditions + self._use_index = False + self._where_condition = None + # ...and return the iterator + return chunkmap + else: + chunkmap = None # default to an in-kernel query + + args = [condvars[param] for param in compiled.parameters] + self._where_condition = (compiled.function, args, compiled.kwargs) + row = tableextension.Row(self) + if profile: + show_stats("Exiting table._where", tref) + return row._iter(start, stop, step, chunkmap=chunkmap) + + def read_where(self, condition, condvars=None, field=None, + start=None, stop=None, step=None): + """Read table data fulfilling the given *condition*. + + This method is similar to :meth:`Table.read`, having their common + arguments and return values the same meanings. However, only the rows + fulfilling the *condition* are included in the result. + + The meaning of the other arguments is the same as in the + :meth:`Table.where` method. 
+ + """ + + self._g_check_open() + coords = [p.nrow for p in + self._where(condition, condvars, start, stop, step)] + self._where_condition = None # reset the conditions + if len(coords) > 1: + cstart, cstop = coords[0], coords[-1] + 1 + if cstop - cstart == len(coords): + # Chances for monotonically increasing row values. Refine. + inc_seq = np.alltrue( + np.arange(cstart, cstop) == np.array(coords)) + if inc_seq: + return self.read(cstart, cstop, field=field) + return self.read_coordinates(coords, field) + + def append_where(self, dstTable, condition=None, condvars=None, + start=None, stop=None, step=None): + """Append rows fulfilling the condition to the dstTable table. + + dstTable must be capable of taking the rows resulting from the query, + i.e. it must have columns with the expected names and compatible + types. The meaning of the other arguments is the same as in the + :meth:`Table.where` method. + + The number of rows appended to dstTable is returned as a result. + + .. versionchanged:: 3.0 + The *whereAppend* method has been renamed into *append_where*. + + """ + + self._g_check_open() + + # Check that the destination file is not in read-only mode. + dstTable._v_file._check_writable() + + # Row objects do not support nested columns, so we must iterate + # over the flat column paths. When rows support nesting, + # ``self.colnames`` can be directly iterated upon. + colNames = [colName for colName in self.colpathnames] + dstRow = dstTable.row + nrows = 0 + if condition is not None: + srcRows = self._where(condition, condvars, start, stop, step) + else: + srcRows = self.iterrows(start, stop, step) + for srcRow in srcRows: + for colName in colNames: + dstRow[colName] = srcRow[colName] + dstRow.append() + nrows += 1 + dstTable.flush() + return nrows + + def get_where_list(self, condition, condvars=None, sort=False, + start=None, stop=None, step=None): + """Get the row coordinates fulfilling the given condition. + + The coordinates are returned as a list of the current flavor. sort + means that you want to retrieve the coordinates ordered. The default is + to not sort them. + + The meaning of the other arguments is the same as in the + :meth:`Table.where` method. + + """ + + self._g_check_open() + + coords = [p.nrow for p in + self._where(condition, condvars, start, stop, step)] + coords = np.array(coords, dtype=SizeType) + # Reset the conditions + self._where_condition = None + if sort: + coords = np.sort(coords) + return internal_to_flavor(coords, self.flavor) + + def itersequence(self, sequence): + """Iterate over a sequence of row coordinates.""" + + if not hasattr(sequence, '__getitem__'): + raise TypeError("Wrong 'sequence' parameter type. Only sequences " + "are suported.") + # start, stop and step are necessary for the new iterator for + # coordinates, and perhaps it would be useful to add them as + # parameters in the future (not now, because I've just removed + # the `sort` argument for 2.1). + # + # *Important note*: Negative values for step are not supported + # for the general case, but only for the itersorted() and + # read_sorted() purposes! The self._process_range_read will raise + # an appropiate error. + # F. Alted 2008-09-18 + # A.V. 
20130513: _process_range_read --> _process_range + (start, stop, step) = self._process_range(None, None, None) + if (start > stop) or (len(sequence) == 0): + return iter([]) + row = tableextension.Row(self) + return row._iter(start, stop, step, coords=sequence) + + def _check_sortby_csi(self, sortby, checkCSI): + if isinstance(sortby, Column): + icol = sortby + elif isinstance(sortby, str): + icol = self.cols._f_col(sortby) + else: + raise TypeError( + "`sortby` can only be a `Column` or string object, " + "but you passed an object of type: %s" % type(sortby)) + if icol.is_indexed and icol.index.kind == "full": + if checkCSI and not icol.index.is_csi: + # The index exists, but it is not a CSI one. + raise ValueError( + "Field `%s` must have associated a CSI index " + "in table `%s`, but the existing one is not. " + % (sortby, self)) + return icol.index + else: + raise ValueError( + "Field `%s` must have associated a 'full' index " + "in table `%s`." % (sortby, self)) + + def itersorted(self, sortby, checkCSI=False, + start=None, stop=None, step=None): + """Iterate table data following the order of the index of sortby + column. + + The sortby column must have associated a full index. If you want to + ensure a fully sorted order, the index must be a CSI one. You may want + to use the checkCSI argument in order to explicitly check for the + existence of a CSI index. + + The meaning of the start, stop and step arguments is the same as in + :meth:`Table.read`. + + .. versionchanged:: 3.0 + If the *start* parameter is provided and *stop* is None then the + table is iterated from *start* to the last line. + In PyTables < 3.0 only one element was returned. + + """ + + index = self._check_sortby_csi(sortby, checkCSI) + # Adjust the slice to be used. + (start, stop, step) = self._process_range(start, stop, step, + warn_negstep=False) + if (start > stop and 0 < step) or (start < stop and 0 > step): + # Fall-back action is to return an empty iterator + return iter([]) + row = tableextension.Row(self) + return row._iter(start, stop, step, coords=index) + + def read_sorted(self, sortby, checkCSI=False, field=None, + start=None, stop=None, step=None): + """Read table data following the order of the index of sortby column. + + The sortby column must have associated a full index. If you want to + ensure a fully sorted order, the index must be a CSI one. You may want + to use the checkCSI argument in order to explicitly check for the + existence of a CSI index. + + If field is supplied only the named column will be selected. If the + column is not nested, an *array* of the current flavor will be + returned; if it is, a *structured array* will be used instead. If no + field is specified, all the columns will be returned in a structured + array of the current flavor. + + The meaning of the start, stop and step arguments is the same as in + :meth:`Table.read`. + + .. versionchanged:: 3.0 + The start, stop and step parameters now behave like in slice. + + """ + + self._g_check_open() + index = self._check_sortby_csi(sortby, checkCSI) + coords = index[start:stop:step] + return self.read_coordinates(coords, field) + + def iterrows(self, start=None, stop=None, step=None): + """Iterate over the table using a Row instance. + + If a range is not supplied, *all the rows* in the table are iterated + upon - you can also use the :meth:`Table.__iter__` special method for + that purpose. If you want to iterate over a given *range of rows* in + the table, you may use the start, stop and step parameters. + + .. 
warning:: + + When in the middle of a table row iterator, you should not + use methods that can change the number of rows in the table + (like :meth:`Table.append` or :meth:`Table.remove_rows`) or + unexpected errors will happen. + + See Also + -------- + tableextension.Row : the table row iterator and field accessor + + Examples + -------- + + :: + + result = [ row['var2'] for row in table.iterrows(step=5) + if row['var1'] <= 20 ] + + .. versionchanged:: 3.0 + If the *start* parameter is provided and *stop* is None then the + table is iterated from *start* to the last line. + In PyTables < 3.0 only one element was returned. + + """ + (start, stop, step) = self._process_range(start, stop, step, + warn_negstep=False) + if (start > stop and 0 < step) or (start < stop and 0 > step): + # Fall-back action is to return an empty iterator + return iter([]) + row = tableextension.Row(self) + return row._iter(start, stop, step) + + def __iter__(self): + """Iterate over the table using a Row instance. + + This is equivalent to calling :meth:`Table.iterrows` with default + arguments, i.e. it iterates over *all the rows* in the table. + + See Also + -------- + tableextension.Row : the table row iterator and field accessor + + Examples + -------- + + :: + + result = [ row['var2'] for row in table if row['var1'] <= 20 ] + + Which is equivalent to:: + + result = [ row['var2'] for row in table.iterrows() + if row['var1'] <= 20 ] + + """ + + return self.iterrows() + + def _read(self, start, stop, step, field=None, out=None): + """Read a range of rows and return an in-memory object.""" + + select_field = None + if field: + if field not in self.coldtypes: + if field in self.description._v_names: + # Remember to select this field + select_field = field + field = None + else: + raise KeyError(("Field {} not found in table " + "{}").format(field, self)) + else: + # The column hangs directly from the top + dtype_field = self.coldtypes[field] + + # Return a rank-0 array if start > stop + if (start >= stop and 0 < step) or (start <= stop and 0 > step): + if field is None: + nra = self._get_container(0) + return nra + return np.empty(shape=0, dtype=dtype_field) + + nrows = len(range(start, stop, step)) + + if out is None: + # Compute the shape of the resulting column object + if field: + # Create a container for the results + result = np.empty(shape=nrows, dtype=dtype_field) + else: + # Recarray case + result = self._get_container(nrows) + else: + # there is no fast way to byteswap, since different columns may + # have different byteorders + if not out.dtype.isnative: + raise ValueError("output array must be in system's byteorder " + "or results will be incorrect") + if field: + bytes_required = dtype_field.itemsize * nrows + else: + bytes_required = self.rowsize * nrows + if bytes_required != out.nbytes: + raise ValueError(f'output array size invalid, got {out.nbytes}' + f' bytes, need {bytes_required} bytes') + if not out.flags['C_CONTIGUOUS']: + raise ValueError('output array not C contiguous') + result = out + + # Call the routine to fill-up the resulting array + if step == 1 and not field: + # This optimization works three times faster than + # the row._fill_col method (up to 170 MB/s on a pentium IV @ 2GHz) + self._read_records(start, stop - start, result) + # Warning!: _read_field_name should not be used until + # H5TBread_fields_name in tableextension will be finished + # F. Alted 2005/05/26 + # XYX Ho implementem per a PyTables 2.0?? 
+ elif field and step > 15 and 0: + # For step>15, this seems to work always faster than row._fill_col. + self._read_field_name(result, start, stop, step, field) + else: + self.row._fill_col(result, start, stop, step, field) + + if select_field: + return result[select_field] + else: + return result + + def read(self, start=None, stop=None, step=None, field=None, out=None): + """Get data in the table as a (record) array. + + The start, stop and step parameters can be used to select only + a *range of rows* in the table. Their meanings are the same as + in the built-in Python slices. + + If field is supplied only the named column will be selected. + If the column is not nested, an *array* of the current flavor + will be returned; if it is, a *structured array* will be used + instead. If no field is specified, all the columns will be + returned in a structured array of the current flavor. + + Columns under a nested column can be specified in the field + parameter by using a slash character (/) as a separator (e.g. + 'position/x'). + + The out parameter may be used to specify a NumPy array to + receive the output data. Note that the array must have the + same size as the data selected with the other parameters. + Note that the array's datatype is not checked and no type + casting is performed, so if it does not match the datatype on + disk, the output will not be correct. + + When specifying a single nested column with the field parameter, + and supplying an output buffer with the out parameter, the + output buffer must contain all columns in the table. + The data in all columns will be read into the output buffer. + However, only the specified nested column will be returned from + the method call. + + When data is read from disk in NumPy format, the output will be + in the current system's byteorder, regardless of how it is + stored on disk. If the out parameter is specified, the output + array also must be in the current system's byteorder. + + .. versionchanged:: 3.0 + Added the *out* parameter. Also the start, stop and step + parameters now behave like in slice. + + Examples + -------- + + Reading the entire table:: + + t.read() + + Reading record n. 6:: + + t.read(6, 7) + + Reading from record n. 
6 to the end of the table:: + + t.read(6) + + """ + + self._g_check_open() + + if field: + self._check_column(field) + + if out is not None and self.flavor != 'numpy': + msg = ("Optional 'out' argument may only be supplied if array " + "flavor is 'numpy', currently is {}").format(self.flavor) + raise TypeError(msg) + + start, stop, step = self._process_range(start, stop, step, + warn_negstep=False) + + arr = self._read(start, stop, step, field, out) + return internal_to_flavor(arr, self.flavor) + + def _read_coordinates(self, coords, field=None): + """Private part of `read_coordinates()` with no flavor conversion.""" + + coords = self._point_selection(coords) + + ncoords = len(coords) + # Create a read buffer only if needed + if field is None or ncoords > 0: + # Doing a copy is faster when ncoords is small (<1000) + if ncoords < min(1000, self.nrowsinbuf): + result = self._v_iobuf[:ncoords].copy() + else: + result = self._get_container(ncoords) + + # Do the real read + if ncoords > 0: + # Turn coords into an array of coordinate indexes, if necessary + if not (isinstance(coords, np.ndarray) and + coords.dtype.type is _npsizetype and + coords.flags.contiguous and + coords.flags.aligned): + # Get a contiguous and aligned coordinate array + coords = np.array(coords, dtype=SizeType) + self._read_elements(coords, result) + + # Do the final conversions, if needed + if field: + if ncoords > 0: + result = get_nested_field(result, field) + else: + # Get an empty array from the cache + result = self._getemptyarray(self.coldtypes[field]) + return result + + def read_coordinates(self, coords, field=None): + """Get a set of rows given their indexes as a (record) array. + + This method works much like the :meth:`Table.read` method, but it uses + a sequence (coords) of row indexes to select the wanted columns, + instead of a column range. + + The selected rows are returned in an array or structured array of the + current flavor. + + """ + + self._g_check_open() + result = self._read_coordinates(coords, field) + return internal_to_flavor(result, self.flavor) + + def get_enum(self, colname): + """Get the enumerated type associated with the named column. + + If the column named colname (a string) exists and is of an enumerated + type, the corresponding Enum instance (see :ref:`EnumClassDescr`) is + returned. If it is not of an enumerated type, a TypeError is raised. If + the column does not exist, a KeyError is raised. + + """ + + self._check_column(colname) + + try: + return self._colenums[colname] + except KeyError: + raise TypeError( + "column ``%s`` of table ``%s`` is not of an enumerated type" + % (colname, self._v_pathname)) + + def col(self, name): + """Get a column from the table. + + If a column called name exists in the table, it is read and returned as + a NumPy object. If it does not exist, a KeyError is raised. + + Examples + -------- + + :: + + narray = table.col('var2') + + That statement is equivalent to:: + + narray = table.read(field='var2') + + Here you can see how this method can be used as a shorthand for the + :meth:`Table.read` method. + + """ + + return self.read(field=name) + + def __getitem__(self, key): + """Get a row or a range of rows from the table. + + If key argument is an integer, the corresponding table row is returned + as a record of the current flavor. If key is a slice, the range of rows + determined by it is returned as a structured array of the current + flavor. + + In addition, NumPy-style point selections are supported. 
In + particular, if key is a list of row coordinates, the set of rows + determined by it is returned. Furthermore, if key is an array of + boolean values, only the coordinates where key is True are returned. + Note that for the latter to work it is necessary that key list would + contain exactly as many rows as the table has. + + Examples + -------- + + :: + + record = table[4] + recarray = table[4:1000:2] + recarray = table[[4,1000]] # only retrieves rows 4 and 1000 + recarray = table[[True, False, ..., True]] + + Those statements are equivalent to:: + + record = table.read(start=4)[0] + recarray = table.read(start=4, stop=1000, step=2) + recarray = table.read_coordinates([4,1000]) + recarray = table.read_coordinates([True, False, ..., True]) + + Here, you can see how indexing can be used as a shorthand for the + :meth:`Table.read` and :meth:`Table.read_coordinates` methods. + + """ + + self._g_check_open() + + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= self.nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += self.nrows + (start, stop, step) = self._process_range(key, key + 1, 1) + return self.read(start, stop, step)[0] + elif isinstance(key, slice): + (start, stop, step) = self._process_range( + key.start, key.stop, key.step) + return self.read(start, stop, step) + # Try with a boolean or point selection + elif type(key) in (list, tuple) or isinstance(key, np.ndarray): + return self._read_coordinates(key, None) + else: + raise IndexError(f"Invalid index or slice: {key!r}") + + def __setitem__(self, key, value): + """Set a row or a range of rows in the table. + + It takes different actions depending on the type of the *key* + parameter: if it is an integer, the corresponding table row is + set to *value* (a record or sequence capable of being converted + to the table structure). If *key* is a slice, the row slice + determined by it is set to *value* (a record array or sequence + capable of being converted to the table structure). + + In addition, NumPy-style point selections are supported. In + particular, if key is a list of row coordinates, the set of rows + determined by it is set to value. Furthermore, if key is an array of + boolean values, only the coordinates where key is True are set to + values from value. Note that for the latter to work it is necessary + that key list would contain exactly as many rows as the table has. + + Examples + -------- + + :: + + # Modify just one existing row + table[2] = [456,'db2',1.2] + + # Modify two existing rows + rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]], + formats='i4,a3,f8') + table[1:30:2] = rows # modify a table slice + table[[1,3]] = rows # only modifies rows 1 and 3 + table[[True,False,True]] = rows # only modifies rows 0 and 2 + + Which is equivalent to:: + + table.modify_rows(start=2, rows=[456,'db2',1.2]) + rows = numpy.rec.array([[457,'db1',1.2],[6,'de2',1.3]], + formats='i4,a3,f8') + table.modify_rows(start=1, stop=3, step=2, rows=rows) + table.modify_coordinates([1,3,2], rows) + table.modify_coordinates([True, False, True], rows) + + Here, you can see how indexing can be used as a shorthand for the + :meth:`Table.modify_rows` and :meth:`Table.modify_coordinates` + methods. 
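A small, self-contained sketch of the equivalence above; the record layout, file name and values are made up for illustration::

    import tables as tb

    class Record(tb.IsDescription):
        col1 = tb.IntCol()
        col2 = tb.StringCol(3)
        col3 = tb.FloatCol()

    with tb.open_file('setitem_demo.h5', mode='w') as h5:
        table = h5.create_table('/', 'demo', Record)
        table.append([(i, b'old', float(i)) for i in range(10)])

        new_rows = [(456, b'db2', 1.2), (457, b'db1', 1.3)]
        table[1:5:2] = new_rows                        # rows 1 and 3
        # ... which has the same effect as:
        table.modify_rows(start=1, stop=5, step=2, rows=new_rows)
        print(table[1], table[3])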
+ + """ + + self._g_check_open() + self._v_file._check_writable() + + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= self.nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += self.nrows + return self.modify_rows(key, key + 1, 1, [value]) + elif isinstance(key, slice): + (start, stop, step) = self._process_range( + key.start, key.stop, key.step) + return self.modify_rows(start, stop, step, value) + # Try with a boolean or point selection + elif type(key) in (list, tuple) or isinstance(key, np.ndarray): + return self.modify_coordinates(key, value) + else: + raise IndexError(f"Invalid index or slice: {key!r}") + + def _save_buffered_rows(self, wbufRA, lenrows): + """Update the indexes after a flushing of rows.""" + + self._open_append(wbufRA) + self._append_records(lenrows) + self._close_append() + if self.indexed: + self._unsaved_indexedrows += lenrows + # The table caches for indexed queries are dirty now + self._dirtycache = True + if self.autoindex: + # Flush the unindexed rows + self.flush_rows_to_index(_lastrow=False) + else: + # All the columns are dirty now + self._mark_columns_as_dirty(self.colpathnames) + + def append(self, rows): + """Append a sequence of rows to the end of the table. + + The rows argument may be any object which can be converted to + a structured array compliant with the table structure + (otherwise, a ValueError is raised). This includes NumPy + structured arrays, lists of tuples or array records, and a + string or Python buffer. + + Examples + -------- + + :: + + import tables as tb + + class Particle(tb.IsDescription): + name = tb.StringCol(16, pos=1) # 16-character String + lati = tb.IntCol(pos=2) # integer + longi = tb.IntCol(pos=3) # integer + pressure = tb.Float32Col(pos=4) # float (single-precision) + temperature = tb.FloatCol(pos=5) # double (double-precision) + + fileh = tb.open_file('test4.h5', mode='w') + table = fileh.create_table(fileh.root, 'table', Particle, + "A table") + + # Append several rows in only one call + table.append([("Particle: 10", 10, 0, 10 * 10, 10**2), + ("Particle: 11", 11, -1, 11 * 11, 11**2), + ("Particle: 12", 12, -2, 12 * 12, 12**2)]) + fileh.close() + + """ + + self._g_check_open() + self._v_file._check_writable() + + if not self._chunked: + raise HDF5ExtError( + "You cannot append rows to a non-chunked table.", h5bt=False) + + # Try to convert the object into a recarray compliant with table + try: + iflavor = flavor_of(rows) + if iflavor != 'python': + rows = array_as_internal(rows, iflavor) + # Works for Python structures and always copies the original, + # so the resulting object is safe for in-place conversion. + wbufRA = np.rec.array(rows, dtype=self._v_dtype) + except Exception as exc: # XXX + raise ValueError("rows parameter cannot be converted into a " + "recarray object compliant with table '%s'. " + "The error was: <%s>" % (str(self), exc)) + lenrows = wbufRA.shape[0] + # If the number of rows to append is zero, don't do anything else + if lenrows > 0: + # Save write buffer to disk + self._save_buffered_rows(wbufRA, lenrows) + + def _conv_to_recarr(self, obj): + """Try to convert the object into a recarray.""" + + try: + iflavor = flavor_of(obj) + if iflavor != 'python': + obj = array_as_internal(obj, iflavor) + if hasattr(obj, "shape") and obj.shape == (): + # To allow conversion of scalars (void type) into arrays. 
+ # See http://projects.scipy.org/scipy/numpy/ticket/315 + # for discussion on how to pass buffers to constructors + # See also http://projects.scipy.org/scipy/numpy/ticket/348 + recarr = np.array([obj], dtype=self._v_dtype) + else: + # Works for Python structures and always copies the original, + # so the resulting object is safe for in-place conversion. + recarr = np.rec.array(obj, dtype=self._v_dtype) + except Exception as exc: # XXX + raise ValueError("Object cannot be converted into a recarray " + "object compliant with table format '%s'. " + "The error was: <%s>" % + (self.description._v_nested_descr, exc)) + + return recarr + + def modify_coordinates(self, coords, rows): + """Modify a series of rows in positions specified in coords. + + The values in the selected rows will be modified with the data given in + rows. This method returns the number of rows modified. + + The possible values for the rows argument are the same as in + :meth:`Table.append`. + + """ + + if rows is None: # Nothing to be done + return SizeType(0) + + # Convert the coordinates to something expected by HDF5 + coords = self._point_selection(coords) + + lcoords = len(coords) + if len(rows) < lcoords: + raise ValueError("The value has not enough elements to fill-in " + "the specified range") + + # Convert rows into a recarray + recarr = self._conv_to_recarr(rows) + + if len(coords) > 0: + # Do the actual update of rows + self._update_elements(lcoords, coords, recarr) + + # Redo the index if needed + self._reindex(self.colpathnames) + + return SizeType(lcoords) + + def modify_rows(self, start=None, stop=None, step=None, rows=None): + """Modify a series of rows in the slice [start:stop:step]. + + The values in the selected rows will be modified with the data given in + rows. This method returns the number of rows modified. Should the + modification exceed the length of the table, an IndexError is raised + before changing data. + + The possible values for the rows argument are the same as in + :meth:`Table.append`. + + """ + + if step is None: + step = 1 + if rows is None: # Nothing to be done + return SizeType(0) + if start is None: + start = 0 + + if start < 0: + raise ValueError("'start' must have a positive value.") + if step < 1: + raise ValueError( + "'step' must have a value greater or equal than 1.") + if stop is None: + # compute the stop value. start + len(rows)*step does not work + stop = start + (len(rows) - 1) * step + 1 + + (start, stop, step) = self._process_range(start, stop, step) + if stop > self.nrows: + raise IndexError("This modification will exceed the length of " + "the table. Giving up.") + # Compute the number of rows to read. + nrows = len(range(start, stop, step)) + if len(rows) != nrows: + raise ValueError("The value has different elements than the " + "specified range") + + # Convert rows into a recarray + recarr = self._conv_to_recarr(rows) + + lenrows = len(recarr) + if start + lenrows > self.nrows: + raise IndexError("This modification will exceed the length of the " + "table. Giving up.") + + # Do the actual update + self._update_records(start, stop, step, recarr) + + # Redo the index if needed + self._reindex(self.colpathnames) + + return SizeType(lenrows) + + def modify_column(self, start=None, stop=None, step=None, + column=None, colname=None): + """Modify one single column in the row slice [start:stop:step]. + + The colname argument specifies the name of the column in the + table to be modified with the data given in column. This + method returns the number of rows modified. 
Should the + modification exceed the length of the table, an IndexError is + raised before changing data. + + The *column* argument may be any object which can be converted + to a (record) array compliant with the structure of the column + to be modified (otherwise, a ValueError is raised). This + includes NumPy (record) arrays, lists of scalars, tuples or + array records, and a string or Python buffer. + + """ + if step is None: + step = 1 + if not isinstance(colname, str): + raise TypeError("The 'colname' parameter must be a string.") + self._v_file._check_writable() + + if column is None: # Nothing to be done + return SizeType(0) + if start is None: + start = 0 + + if start < 0: + raise ValueError("'start' must have a positive value.") + if step < 1: + raise ValueError( + "'step' must have a value greater or equal than 1.") + # Get the column format to be modified: + objcol = self._get_column_instance(colname) + descr = [objcol._v_parent._v_nested_descr[objcol._v_pos]] + # Try to convert the column object into a NumPy ndarray + try: + # If the column is a recarray (or kind of), convert into ndarray + if hasattr(column, 'dtype') and column.dtype.kind == 'V': + column = np.rec.array(column, dtype=descr).field(0) + else: + # Make sure the result is always a *copy* of the original, + # so the resulting object is safe for in-place conversion. + iflavor = flavor_of(column) + column = array_as_internal(column, iflavor) + except Exception as exc: # XXX + raise ValueError("column parameter cannot be converted into a " + "ndarray object compliant with specified column " + "'%s'. The error was: <%s>" % (str(column), exc)) + + # Get rid of single-dimensional dimensions + column = column.squeeze() + if column.shape == (): + # Oops, stripped off to much dimensions + column.shape = (1,) + + if stop is None: + # compute the stop value. start + len(rows)*step does not work + stop = start + (len(column) - 1) * step + 1 + (start, stop, step) = self._process_range(start, stop, step) + if stop > self.nrows: + raise IndexError("This modification will exceed the length of " + "the table. Giving up.") + # Compute the number of rows to read. + nrows = len(range(start, stop, step)) + if len(column) < nrows: + raise ValueError("The value has not enough elements to fill-in " + "the specified range") + # Now, read the original values: + mod_recarr = self._read(start, stop, step) + # Modify the appropriate column in the original recarray + mod_col = get_nested_field(mod_recarr, colname) + mod_col[:] = column + # save this modified rows in table + self._update_records(start, stop, step, mod_recarr) + # Redo the index if needed + self._reindex([colname]) + + return SizeType(nrows) + + def modify_columns(self, start=None, stop=None, step=None, + columns=None, names=None): + """Modify a series of columns in the row slice [start:stop:step]. + + The names argument specifies the names of the columns in the + table to be modified with the data given in columns. This + method returns the number of rows modified. Should the + modification exceed the length of the table, an IndexError + is raised before changing data. + + The columns argument may be any object which can be converted + to a structured array compliant with the structure of the + columns to be modified (otherwise, a ValueError is raised). + This includes NumPy structured arrays, lists of tuples or array + records, and a string or Python buffer. 
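A hedged sketch of both single- and multi-column modification on a hypothetical table (the ``Point`` description, file name and values below are assumptions, not part of the library)::

    import tables as tb

    class Point(tb.IsDescription):
        x = tb.FloatCol()
        y = tb.FloatCol()
        tag = tb.IntCol()

    with tb.open_file('modify_demo.h5', mode='w') as h5:
        table = h5.create_table('/', 'points', Point)
        table.append([(float(i), 2.0 * i, i) for i in range(8)])

        # Overwrite column 'x' for rows 0..3.
        table.modify_column(start=0, stop=4, colname='x',
                            column=[10.0, 11.0, 12.0, 13.0])

        # Overwrite two columns at once for rows 2 and 4 (step=2).
        table.modify_columns(start=2, stop=6, step=2, names=['y', 'tag'],
                             columns=[[99.0, 98.0], [7, 8]])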
+ + """ + if step is None: + step = 1 + if type(names) not in (list, tuple): + raise TypeError("The 'names' parameter must be a list of strings.") + + if columns is None: # Nothing to be done + return SizeType(0) + if start is None: + start = 0 + if start < 0: + raise ValueError("'start' must have a positive value.") + if step < 1: + raise ValueError("'step' must have a value greater or " + "equal than 1.") + descr = [] + for colname in names: + objcol = self._get_column_instance(colname) + descr.append(objcol._v_parent._v_nested_descr[objcol._v_pos]) + # descr.append(objcol._v_parent._v_dtype[objcol._v_pos]) + # Try to convert the columns object into a recarray + try: + # Make sure the result is always a *copy* of the original, + # so the resulting object is safe for in-place conversion. + iflavor = flavor_of(columns) + if iflavor != 'python': + columns = array_as_internal(columns, iflavor) + recarray = np.rec.array(columns, dtype=descr) + else: + recarray = np.rec.fromarrays(columns, dtype=descr) + except Exception as exc: # XXX + raise ValueError("columns parameter cannot be converted into a " + "recarray object compliant with table '%s'. " + "The error was: <%s>" % (str(self), exc)) + + if stop is None: + # compute the stop value. start + len(rows)*step does not work + stop = start + (len(recarray) - 1) * step + 1 + (start, stop, step) = self._process_range(start, stop, step) + if stop > self.nrows: + raise IndexError("This modification will exceed the length of " + "the table. Giving up.") + # Compute the number of rows to read. + nrows = len(range(start, stop, step)) + if len(recarray) < nrows: + raise ValueError("The value has not enough elements to fill-in " + "the specified range") + # Now, read the original values: + mod_recarr = self._read(start, stop, step) + # Modify the appropriate columns in the original recarray + for i, name in enumerate(recarray.dtype.names): + mod_col = get_nested_field(mod_recarr, names[i]) + mod_col[:] = recarray[name].squeeze() + # save this modified rows in table + self._update_records(start, stop, step, mod_recarr) + # Redo the index if needed + self._reindex(names) + + return SizeType(nrows) + + def flush_rows_to_index(self, _lastrow=True): + """Add remaining rows in buffers to non-dirty indexes. + + This can be useful when you have chosen non-automatic indexing + for the table (see the :attr:`Table.autoindex` property in + :class:`Table`) and you want to update the indexes on it. + + """ + + rowsadded = 0 + if self.indexed: + # Update the number of unsaved indexed rows + start = self._indexedrows + nrows = self._unsaved_indexedrows + for (colname, colindexed) in self.colindexed.items(): + if colindexed: + col = self.cols._g_col(colname) + if nrows > 0 and not col.index.dirty: + rowsadded = self._add_rows_to_index( + colname, start, nrows, _lastrow, update=True) + self._unsaved_indexedrows -= rowsadded + self._indexedrows += rowsadded + return rowsadded + + def _add_rows_to_index(self, colname, start, nrows, lastrow, update): + """Add more elements to the existing index.""" + + # This method really belongs to Column, but since it makes extensive + # use of the table, it gets dangerous when closing the file, since the + # column may be accessing a table which is being destroyed. + index = self.cols._g_col(colname).index + slicesize = index.slicesize + # The next loop does not rely on xrange so that it can + # deal with long ints (i.e. more than 32-bit integers) + # This allows to index columns with more than 2**31 rows + # F. 
Alted 2005-05-09 + startLR = index.sorted.nrows * slicesize + indexedrows = startLR - start + stop = start + nrows - slicesize + 1 + while startLR < stop: + index.append( + [self._read(startLR, startLR + slicesize, 1, colname)], + update=update) + indexedrows += slicesize + startLR += slicesize + # index the remaining rows in last row + if lastrow and startLR < self.nrows: + index.append_last_row( + [self._read(startLR, self.nrows, 1, colname)], + update=update) + indexedrows += self.nrows - startLR + return indexedrows + + def remove_rows(self, start=None, stop=None, step=None): + """Remove a range of rows in the table. + + If only start is supplied, that row and all following will be deleted. + If a range is supplied, i.e. both the start and stop parameters are + passed, all the rows in the range are removed. + + .. versionchanged:: 3.0 + The start, stop and step parameters now behave like in slice. + + .. seealso:: remove_row() + + Parameters + ---------- + start : int + Sets the starting row to be removed. It accepts negative values + meaning that the count starts from the end. A value of 0 means the + first row. + stop : int + Sets the last row to be removed to stop-1, i.e. the end point is + omitted (in the Python range() tradition). Negative values are also + accepted. If None all rows after start will be removed. + step : int + The step size between rows to remove. + + .. versionadded:: 3.0 + + Examples + -------- + + Removing rows from 5 to 10 (excluded):: + + t.remove_rows(5, 10) + + Removing all rows starting from the 10th:: + + t.remove_rows(10) + + Removing the 6th row:: + + t.remove_rows(6, 7) + + .. note:: + + removing a single row can be done using the specific + :meth:`remove_row` method. + + """ + + (start, stop, step) = self._process_range(start, stop, step) + nrows = self._remove_rows(start, stop, step) + # remove_rows is a invalidating index operation + self._reindex(self.colpathnames) + + return SizeType(nrows) + + def remove_row(self, n): + """Removes a row from the table. + + Parameters + ---------- + n : int + The index of the row to remove. + + + .. versionadded:: 3.0 + + Examples + -------- + + Remove row 15:: + + table.remove_row(15) + + Which is equivalent to:: + + table.remove_rows(15, 16) + + .. warning:: + + This is not equivalent to:: + + table.remove_rows(15) + + """ + + self.remove_rows(start=n, stop=n + 1) + + def _g_update_dependent(self): + super()._g_update_dependent() + + # Update the new path in columns + self.cols._g_update_table_location(self) + + # Update the new path in the Row instance, if cached. Fixes #224. + if 'row' in self.__dict__: + self.__dict__['row'] = tableextension.Row(self) + + def _g_move(self, newparent, newname): + """Move this node in the hierarchy. + + This overloads the Node._g_move() method. + + """ + + itgpathname = _index_pathname_of(self) + + # First, move the table to the new location. + super()._g_move(newparent, newname) + + # Then move the associated index group (if any). + try: + itgroup = self._v_file._get_node(itgpathname) + except NoSuchNodeError: + pass + else: + newigroup = self._v_parent + newiname = _index_name_of(self) + itgroup._g_move(newigroup, newiname) + + def _g_remove(self, recursive=False, force=False): + # Remove the associated index group (if any). 
+ itgpathname = _index_pathname_of(self) + try: + itgroup = self._v_file._get_node(itgpathname) + except NoSuchNodeError: + pass + else: + itgroup._f_remove(recursive=True) + self.indexed = False # there are indexes no more + + # Remove the leaf itself from the hierarchy. + super()._g_remove(recursive, force) + + def _set_column_indexing(self, colpathname, indexed): + """Mark the referred column as indexed or non-indexed.""" + + colindexed = self.colindexed + isindexed, wasindexed = bool(indexed), colindexed[colpathname] + if isindexed == wasindexed: + return # indexing state is unchanged + + # Changing the set of indexed columns invalidates the condition cache + self._condition_cache.clear() + colindexed[colpathname] = isindexed + self.indexed = max(colindexed.values()) # this is an OR :) + + def _mark_columns_as_dirty(self, colnames): + """Mark column indexes in `colnames` as dirty.""" + + assert len(colnames) > 0 + if self.indexed: + colindexed, cols = self.colindexed, self.cols + # Mark the proper indexes as dirty + for colname in colnames: + if colindexed[colname]: + col = cols._g_col(colname) + col.index.dirty = True + + def _reindex(self, colnames): + """Re-index columns in `colnames` if automatic indexing is true.""" + + if self.indexed: + colindexed, cols = self.colindexed, self.cols + colstoindex = [] + # Mark the proper indexes as dirty + for colname in colnames: + if colindexed[colname]: + col = cols._g_col(colname) + col.index.dirty = True + colstoindex.append(colname) + # Now, re-index the dirty ones + if self.autoindex and colstoindex: + self._do_reindex(dirty=True) + # The table caches for indexed queries are dirty now + self._dirtycache = True + + def _do_reindex(self, dirty): + """Common code for `reindex()` and `reindex_dirty()`.""" + + indexedrows = 0 + for (colname, colindexed) in self.colindexed.items(): + if colindexed: + indexcol = self.cols._g_col(colname) + indexedrows = indexcol._do_reindex(dirty) + # Update counters in case some column has been updated + if indexedrows > 0: + self._indexedrows = indexedrows + self._unsaved_indexedrows = self.nrows - indexedrows + + return SizeType(indexedrows) + + def reindex(self): + """Recompute all the existing indexes in the table. + + This can be useful when you suspect that, for any reason, the + index information for columns is no longer valid and want to + rebuild the indexes on it. + + """ + + self._do_reindex(dirty=False) + + def reindex_dirty(self): + """Recompute the existing indexes in table, *if* they are dirty. + + This can be useful when you have set :attr:`Table.autoindex` + (see :class:`Table`) to false for the table and you want to + update the indexes after a invalidating index operation + (:meth:`Table.remove_rows`, for example). 
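A minimal sketch of the manual-indexing workflow this enables, assuming a hypothetical ``events`` table with an indexed ``ts`` column (all names and values here are made up for the example)::

    import tables as tb

    class Event(tb.IsDescription):
        ts = tb.Int64Col()
        code = tb.IntCol()

    with tb.open_file('index_demo.h5', mode='w') as h5:
        table = h5.create_table('/', 'events', Event)
        table.cols.ts.create_index()     # index the 'ts' column
        table.autoindex = False          # take over index maintenance

        table.append([(i, i % 5) for i in range(1000)])
        table.flush()

        # With autoindex disabled, the appends above left the index dirty,
        # so queries will not use it until it is rebuilt explicitly.
        table.reindex_dirty()
        print(table.will_query_use_indexing('ts > 500'))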
+ + """ + + self._do_reindex(dirty=True) + + def _g_copy_rows(self, object, start, stop, step, sortby, checkCSI): + """Copy rows from self to object""" + if sortby is None: + self._g_copy_rows_optim(object, start, stop, step) + return + lenbuf = self.nrowsinbuf + absstep = step + if step < 0: + absstep = -step + start, stop = stop + 1, start + 1 + if sortby is not None: + index = self._check_sortby_csi(sortby, checkCSI) + for start2 in range(start, stop, absstep * lenbuf): + stop2 = start2 + absstep * lenbuf + if stop2 > stop: + stop2 = stop + # The next 'if' is not needed, but it doesn't bother either + if sortby is None: + rows = self[start2:stop2:step] + else: + coords = index[start2:stop2:step] + rows = self.read_coordinates(coords) + # Save the records on disk + object.append(rows) + object.flush() + + def _g_copy_rows_optim(self, object, start, stop, step): + """Copy rows from self to object (optimized version)""" + + nrowsinbuf = self.nrowsinbuf + object._open_append(self._v_iobuf) + nrowsdest = object.nrows + for start2 in range(start, stop, step * nrowsinbuf): + # Save the records on disk + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + # Optimized version (it saves some conversions) + nrows = ((stop2 - start2 - 1) // step) + 1 + self.row._fill_col(self._v_iobuf, start2, stop2, step, None) + # The output buffer is created anew, + # so the operation is safe to in-place conversion. + object._append_records(nrows) + nrowsdest += nrows + object._close_append() + + def _g_prop_indexes(self, other): + """Generate index in `other` table for every indexed column here.""" + + oldcols, newcols = self.colinstances, other.colinstances + for colname in newcols: + if (isinstance(oldcols[colname], Column)): + oldcolindexed = oldcols[colname].is_indexed + if oldcolindexed: + oldcolindex = oldcols[colname].index + newcol = newcols[colname] + newcol.create_index( + kind=oldcolindex.kind, optlevel=oldcolindex.optlevel, + filters=oldcolindex.filters, tmp_dir=None) + + def _g_copy_with_stats(self, group, name, start, stop, step, + title, filters, chunkshape, _log, **kwargs): + """Private part of Leaf.copy() for each kind of leaf.""" + + # Get the private args for the Table flavor of copy() + sortby = kwargs.pop('sortby', None) + propindexes = kwargs.pop('propindexes', False) + checkCSI = kwargs.pop('checkCSI', False) + # Compute the correct indices. + (start, stop, step) = self._process_range_read( + start, stop, step, warn_negstep=sortby is None) + # And the number of final rows + nrows = len(range(start, stop, step)) + # Create the new table and copy the selected data. + newtable = Table(group, name, self.description, title=title, + filters=filters, expectedrows=nrows, + chunkshape=chunkshape, + _log=_log) + self._g_copy_rows(newtable, start, stop, step, sortby, checkCSI) + nbytes = newtable.nrows * newtable.rowsize + # Generate equivalent indexes in the new table, if required. + if propindexes and self.indexed: + self._g_prop_indexes(newtable) + return (newtable, nbytes) + + # This overloading of copy is needed here in order to document + # the additional keywords for the Table case. + def copy(self, newparent=None, newname=None, overwrite=False, + createparents=False, **kwargs): + """Copy this table and return the new one. + + This method has the behavior and keywords described in + :meth:`Leaf.copy`. Moreover, it recognises the following additional + keyword arguments. 
+ + Parameters + ---------- + sortby + If specified, and sortby corresponds to a column with an index, + then the copy will be sorted by this index. If you want to ensure + a fully sorted order, the index must be a CSI one. A reverse + sorted copy can be achieved by specifying a negative value for the + step keyword. If sortby is omitted or None, the original table + order is used. + checkCSI + If true and a CSI index does not exist for the sortby column, an + error will be raised. If false (the default), it does nothing. + You can use this flag in order to explicitly check for the + existence of a CSI index. + propindexes + If true, the existing indexes in the source table are propagated + (created) to the new one. If false (the default), the indexes are + not propagated. + + """ + + return super().copy( + newparent, newname, overwrite, createparents, **kwargs) + + def flush(self): + """Flush the table buffers.""" + + if self._v_file._iswritable(): + # Flush rows that remains to be appended + if 'row' in self.__dict__: + self.row._flush_buffered_rows() + if self.indexed and self.autoindex: + # Flush any unindexed row + rowsadded = self.flush_rows_to_index(_lastrow=True) + assert rowsadded <= 0 or self._indexedrows == self.nrows, \ + ("internal error: the number of indexed rows (%d) " + "and rows in the table (%d) is not equal; " + "please report this to the authors." + % (self._indexedrows, self.nrows)) + if self._dirtyindexes: + # Finally, re-index any dirty column + self.reindex_dirty() + + super().flush() + + def _g_pre_kill_hook(self): + """Code to be called before killing the node.""" + + # Flush the buffers before to clean-up them + # self.flush() + # It seems that flushing during the __del__ phase is a sure receipt for + # bringing all kind of problems: + # 1. Illegal Instruction + # 2. Malloc(): trying to call free() twice + # 3. Bus Error + # 4. Segmentation fault + # So, the best would be doing *nothing* at all in this __del__ phase. + # As a consequence, the I/O will not be cleaned until a call to + # Table.flush() would be done. This could lead to a potentially large + # memory consumption. + # NOTE: The user should make a call to Table.flush() whenever he has + # finished working with his table. + # I've added a Performance warning in order to compel the user to + # call self.flush() before the table is being preempted. + # F. Alted 2006-08-03 + if (('row' in self.__dict__ and self.row._get_unsaved_nrows() > 0) or + (self.indexed and self.autoindex and + (self._unsaved_indexedrows > 0 or self._dirtyindexes))): + warnings.warn(("table ``%s`` is being preempted from alive nodes " + "without its buffers being flushed or with some " + "index being dirty. This may lead to very " + "ineficient use of resources and even to fatal " + "errors in certain situations. Please do a call " + "to the .flush() or .reindex_dirty() methods on " + "this table before start using other nodes.") + % (self._v_pathname), PerformanceWarning) + # Get rid of the IO buffers (if they have been created at all) + mydict = self.__dict__ + if '_v_iobuf' in mydict: + del mydict['_v_iobuf'] + if '_v_wdflts' in mydict: + del mydict['_v_wdflts'] + + def _f_close(self, flush=True): + if not self._v_isopen: + return # the node is already closed + + # .. note:: + # + # As long as ``Table`` objects access their indices on closing, + # ``File.close()`` will need to make *two separate passes* + # to first close ``Table`` objects and then ``Index`` hierarchies. 
+ # + + # Flush right now so the row object does not get in the middle. + if flush: + self.flush() + + # Some warnings can be issued after calling `self._g_set_location()` + # in `self.__init__()`. If warnings are turned into exceptions, + # `self._g_post_init_hook` may not be called and `self.cols` not set. + # One example of this is + # ``test_create.createTestCase.test05_maxFieldsExceeded()``. + cols = self.cols + if cols is not None: + cols._g_close() + + # Close myself as a leaf. + super()._f_close(False) + + def __repr__(self): + """This provides column metainfo in addition to standard __str__""" + + if self.indexed: + format = """\ +%s + description := %r + byteorder := %r + chunkshape := %r + autoindex := %r + colindexes := %r""" + return format % (str(self), self.description, self.byteorder, + self.chunkshape, self.autoindex, + _ColIndexes(self.colindexes)) + else: + return """\ +%s + description := %r + byteorder := %r + chunkshape := %r""" % \ + (str(self), self.description, self.byteorder, self.chunkshape) + + +class Cols: + """Container for columns in a table or nested column. + + This class is used as an *accessor* to the columns in a table or nested + column. It supports the *natural naming* convention, so that you can + access the different columns as attributes which lead to Column instances + (for non-nested columns) or other Cols instances (for nested columns). + + For instance, if table.cols is a Cols instance with a column named col1 + under it, the later can be accessed as table.cols.col1. If col1 is nested + and contains a col2 column, this can be accessed as table.cols.col1.col2 + and so on. Because of natural naming, the names of members start with + special prefixes, like in the Group class (see :ref:`GroupClassDescr`). + + Like the Column class (see :ref:`ColumnClassDescr`), Cols supports item + access to read and write ranges of values in the table or nested column. + + + .. rubric:: Cols attributes + + .. attribute:: _v_colnames + + A list of the names of the columns hanging directly + from the associated table or nested column. The order of + the names matches the order of their respective columns in + the containing table. + + .. attribute:: _v_colpathnames + + A list of the pathnames of all the columns under the + associated table or nested column (in preorder). If it does + not contain nested columns, this is exactly the same as the + :attr:`Cols._v_colnames` attribute. + + .. attribute:: _v_desc + + The associated Description instance (see + :ref:`DescriptionClassDescr`). + + """ + + @property + def _v_table(self): + """The parent Table instance (see :ref:`TableClassDescr`).""" + return self._v__tableFile._get_node(self._v__tablePath) + + def __init__(self, table, desc): + myDict = self.__dict__ + myDict['_v__tableFile'] = table._v_file + myDict['_v__tablePath'] = table._v_pathname + myDict['_v_desc'] = desc + myDict['_v_colnames'] = desc._v_names + myDict['_v_colpathnames'] = table.description._v_pathnames + # Put the column in the local dictionary + for name in desc._v_names: + if name in desc._v_types: + myDict[name] = Column(table, name, desc) + else: + myDict[name] = Cols(table, desc._v_colobjects[name]) + + def _g_update_table_location(self, table): + """Updates the location information about the associated `table`.""" + + myDict = self.__dict__ + myDict['_v__tableFile'] = table._v_file + myDict['_v__tablePath'] = table._v_pathname + + # Update the locations in individual columns. 
+ for colname in self._v_colnames: + myDict[colname]._g_update_table_location(table) + + def __len__(self): + """Get the number of top level columns in table.""" + + return len(self._v_colnames) + + def _f_col(self, colname): + """Get an accessor to the column colname. + + This method returns a Column instance (see :ref:`ColumnClassDescr`) if + the requested column is not nested, and a Cols instance (see + :ref:`ColsClassDescr`) if it is. You may use full column pathnames in + colname. + + Calling cols._f_col('col1/col2') is equivalent to using cols.col1.col2. + However, the first syntax is more intended for programmatic use. It is + also better if you want to access columns with names that are not valid + Python identifiers. + + """ + + if not isinstance(colname, str): + raise TypeError("Parameter can only be an string. You passed " + "object: %s" % colname) + if ((colname.find('/') > -1 and + colname not in self._v_colpathnames) and + colname not in self._v_colnames): + raise KeyError(("Cols accessor ``%s.cols%s`` does not have a " + "column named ``%s``") + % (self._v__tablePath, self._v_desc._v_pathname, + colname)) + + return self._g_col(colname) + + def _g_col(self, colname): + """Like `self._f_col()` but it does not check arguments.""" + + # Get the Column or Description object + inames = colname.split('/') + cols = self + for iname in inames: + cols = cols.__dict__[iname] + return cols + + def __getitem__(self, key): + """Get a row or a range of rows from a table or nested column. + + If key argument is an integer, the corresponding nested type row is + returned as a record of the current flavor. If key is a slice, the + range of rows determined by it is returned as a structured array of the + current flavor. + + Examples + -------- + + :: + + record = table.cols[4] # equivalent to table[4] + recarray = table.cols.Info[4:1000:2] + + Those statements are equivalent to:: + + nrecord = table.read(start=4)[0] + nrecarray = table.read(start=4, stop=1000, step=2).field('Info') + + Here you can see how a mix of natural naming, indexing and slicing can + be used as shorthands for the :meth:`Table.read` method. + + """ + + table = self._v_table + nrows = table.nrows + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += nrows + (start, stop, step) = table._process_range(key, key + 1, 1) + colgroup = self._v_desc._v_pathname + if colgroup == "": # The root group + return table.read(start, stop, step)[0] + else: + crecord = table.read(start, stop, step)[0] + return crecord[colgroup] + elif isinstance(key, slice): + (start, stop, step) = table._process_range( + key.start, key.stop, key.step) + colgroup = self._v_desc._v_pathname + if colgroup == "": # The root group + return table.read(start, stop, step) + else: + crecarray = table.read(start, stop, step) + if hasattr(crecarray, "field"): + return crecarray.field(colgroup) # RecArray case + else: + return get_nested_field(crecarray, colgroup) # numpy case + else: + raise TypeError(f"invalid index or slice: {key!r}") + + def __setitem__(self, key, value): + """Set a row or a range of rows in a table or nested column. + + If key argument is an integer, the corresponding row is set to + value. If key is a slice, the range of rows determined by it is set to + value. 
+ + Examples + -------- + + :: + + table.cols[4] = record + table.cols.Info[4:1000:2] = recarray + + Those statements are equivalent to:: + + table.modify_rows(4, rows=record) + table.modify_column(4, 1000, 2, colname='Info', column=recarray) + + Here you can see how a mix of natural naming, indexing and slicing + can be used as shorthands for the :meth:`Table.modify_rows` and + :meth:`Table.modify_column` methods. + + """ + + table = self._v_table + nrows = table.nrows + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += nrows + (start, stop, step) = table._process_range(key, key + 1, 1) + elif isinstance(key, slice): + (start, stop, step) = table._process_range( + key.start, key.stop, key.step) + else: + raise TypeError(f"invalid index or slice: {key!r}") + + # Actually modify the correct columns + colgroup = self._v_desc._v_pathname + if colgroup == "": # The root group + table.modify_rows(start, stop, step, rows=value) + else: + table.modify_column( + start, stop, step, colname=colgroup, column=value) + + def _g_close(self): + # First, close the columns (ie possible indices open) + for col in self._v_colnames: + colobj = self._g_col(col) + if isinstance(colobj, Column): + colobj.close() + # Delete the reference to column + del self.__dict__[col] + else: + colobj._g_close() + + self.__dict__.clear() + + def __str__(self): + """The string representation for this object.""" + + # The pathname + descpathname = self._v_desc._v_pathname + if descpathname: + descpathname = "." + descpathname + return (f"{self._v__tablePath}.cols{descpathname} " + f"({self.__class__.__name__}), " + f"{len(self._v_colnames)} columns") + + def __repr__(self): + """A detailed string representation for this object.""" + + lines = [f'{self!s}'] + for name in self._v_colnames: + # Get this class name + classname = getattr(self, name).__class__.__name__ + # The type + if name in self._v_desc._v_dtypes: + tcol = self._v_desc._v_dtypes[name] + # The shape for this column + shape = (self._v_table.nrows,) + \ + self._v_desc._v_dtypes[name].shape + else: + tcol = "Description" + # Description doesn't have a shape currently + shape = () + lines.append(f" {name} ({classname}{shape}, {tcol})") + return '\n'.join(lines) + '\n' + + +class Column: + """Accessor for a non-nested column in a table. + + Each instance of this class is associated with one *non-nested* column of a + table. These instances are mainly used to read and write data from the + table columns using item access (like the Cols class - see + :ref:`ColsClassDescr`), but there are a few other associated methods to + deal with indexes. + + .. rubric:: Column attributes + + .. attribute:: descr + + The Description (see :ref:`DescriptionClassDescr`) instance of the + parent table or nested column. + + .. attribute:: name + + The name of the associated column. + + .. attribute:: pathname + + The complete pathname of the associated column (the same as + Column.name if the column is not inside a nested column). 
+ + Parameters + ---------- + table + The parent table instance + name + The name of the column that is associated with this object + descr + The parent description object + + """ + + @lazyattr + def dtype(self): + """The NumPy dtype that most closely matches this column.""" + + return self.descr._v_dtypes[self.name].base # Get rid of shape info + + @lazyattr + def type(self): + """The PyTables type of the column (a string).""" + + return self.descr._v_types[self.name] + + @property + def table(self): + """The parent Table instance (see :ref:`TableClassDescr`).""" + return self._table_file._get_node(self._table_path) + + @property + def index(self): + """The Index instance (see :ref:`IndexClassDescr`) associated with this + column (None if the column is not indexed).""" + indexPath = _index_pathname_of_column_(self._table_path, self.pathname) + try: + index = self._table_file._get_node(indexPath) + except NodeError: + index = None # The column is not indexed + return index + + @lazyattr + def _itemtype(self): + return self.descr._v_dtypes[self.name] + + @property + def shape(self): + """The shape of this column.""" + return (self.table.nrows,) + self.descr._v_dtypes[self.name].shape + + @property + def is_indexed(self): + """True if the column is indexed, false otherwise.""" + if self.index is None: + return False + else: + return True + + @property + def maindim(self): + """"The dimension along which iterators work. Its value is 0 (i.e. the + first dimension).""" + return 0 + + def __init__(self, table, name, descr): + self._table_file = table._v_file + self._table_path = table._v_pathname + self.name = name + """The name of the associated column.""" + self.pathname = descr._v_colobjects[name]._v_pathname + """The complete pathname of the associated column (the same as + Column.name if the column is not inside a nested column).""" + self.descr = descr + """The Description (see :ref:`DescriptionClassDescr`) instance of the + parent table or nested column.""" + + def _g_update_table_location(self, table): + """Updates the location information about the associated `table`.""" + + self._table_file = table._v_file + self._table_path = table._v_pathname + + def __len__(self): + """Get the number of elements in the column. + + This matches the length in rows of the parent table. + + """ + + return self.table.nrows + + def __getitem__(self, key): + """Get a row or a range of rows from a column. + + If key argument is an integer, the corresponding element in the column + is returned as an object of the current flavor. If key is a slice, the + range of elements determined by it is returned as an array of the + current flavor. 
+ + Examples + -------- + + :: + + print("Column handlers:") + for name in table.colnames: + print(table.cols._f_col(name)) + print("Select table.cols.name[1]-->", table.cols.name[1]) + print("Select table.cols.name[1:2]-->", table.cols.name[1:2]) + print("Select table.cols.name[:]-->", table.cols.name[:]) + print("Select table.cols._f_col('name')[:]-->", + table.cols._f_col('name')[:]) + + The output of this for a certain arbitrary table is:: + + Column handlers: + /table.cols.name (Column(), string, idx=None) + /table.cols.lati (Column(), int32, idx=None) + /table.cols.longi (Column(), int32, idx=None) + /table.cols.vector (Column(2,), int32, idx=None) + /table.cols.matrix2D (Column(2, 2), float64, idx=None) + Select table.cols.name[1]--> Particle: 11 + Select table.cols.name[1:2]--> ['Particle: 11'] + Select table.cols.name[:]--> ['Particle: 10' + 'Particle: 11' 'Particle: 12' + 'Particle: 13' 'Particle: 14'] + Select table.cols._f_col('name')[:]--> ['Particle: 10' + 'Particle: 11' 'Particle: 12' + 'Particle: 13' 'Particle: 14'] + + See the :file:`examples/table2.py` file for a more complete example. + + """ + + table = self.table + + # Generalized key support not there yet, but at least allow + # for a tuple with one single element (the main dimension). + # (key,) --> key + if isinstance(key, tuple) and len(key) == 1: + key = key[0] + + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= table.nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += table.nrows + (start, stop, step) = table._process_range(key, key + 1, 1) + return table.read(start, stop, step, self.pathname)[0] + elif isinstance(key, slice): + (start, stop, step) = table._process_range( + key.start, key.stop, key.step) + return table.read(start, stop, step, self.pathname) + else: + raise TypeError( + "'%s' key type is not valid in this context" % key) + + def __iter__(self): + """Iterate through all items in the column.""" + + table = self.table + itemsize = self.dtype.itemsize + nrowsinbuf = table._v_file.params['IO_BUFFER_SIZE'] // itemsize + buf = np.empty((nrowsinbuf, ), self._itemtype) + max_row = len(self) + for start_row in range(0, len(self), nrowsinbuf): + end_row = min(start_row + nrowsinbuf, max_row) + buf_slice = buf[0:end_row - start_row] + table.read(start_row, end_row, 1, field=self.pathname, + out=buf_slice) + yield from buf_slice + + def __setitem__(self, key, value): + """Set a row or a range of rows in a column. + + If key argument is an integer, the corresponding element is set to + value. If key is a slice, the range of elements determined by it is + set to value. + + Examples + -------- + + :: + + # Modify row 1 + table.cols.col1[1] = -1 + + # Modify rows 1 and 3 + table.cols.col1[1::2] = [2,3] + + Which is equivalent to:: + + # Modify row 1 + table.modify_columns(start=1, columns=[[-1]], names=['col1']) + + # Modify rows 1 and 3 + columns = numpy.rec.fromarrays([[2,3]], formats='i4') + table.modify_columns(start=1, step=2, columns=columns, + names=['col1']) + + """ + + table = self.table + table._v_file._check_writable() + + # Generalized key support not there yet, but at least allow + # for a tuple with one single element (the main dimension). 
+        # (key,) --> key
+        if isinstance(key, tuple) and len(key) == 1:
+            key = key[0]
+
+        if is_idx(key):
+            key = operator.index(key)
+
+            # Index out of range protection
+            if key >= table.nrows:
+                raise IndexError("Index out of range")
+            if key < 0:
+                # To support negative values
+                key += table.nrows
+            return table.modify_column(key, key + 1, 1,
+                                       [[value]], self.pathname)
+        elif isinstance(key, slice):
+            (start, stop, step) = table._process_range(
+                key.start, key.stop, key.step)
+            return table.modify_column(start, stop, step,
+                                       value, self.pathname)
+        else:
+            raise ValueError("Non-valid index or slice: %s" % key)
+
+    def create_index(self, optlevel=6, kind="medium", filters=None,
+                     tmp_dir=None, _blocksizes=None, _testmode=False,
+                     _verbose=False):
+        """Create an index for this column.
+
+        .. warning::
+
+            In some situations it is useful to get a completely sorted
+            index (CSI). For those cases, it is best to use the
+            :meth:`Column.create_csindex` method instead.
+
+        Parameters
+        ----------
+        optlevel : int
+            The optimization level for building the index. The levels range
+            from 0 (no optimization) up to 9 (maximum optimization). Higher
+            levels of optimization mean better chances for reducing the
+            entropy of the index at the price of using more CPU, memory and
+            I/O resources for creating the index.
+        kind : str
+            The kind of the index to be built. It can take the 'ultralight',
+            'light', 'medium' or 'full' values. Lighter kinds ('ultralight'
+            and 'light') mean that the index takes less space on disk, but
+            will perform queries more slowly. Heavier kinds ('medium' and
+            'full') mean better chances for reducing the entropy of the
+            index (increasing the query speed) at the price of using more
+            disk space as well as more CPU, memory and I/O resources for
+            creating the index.
+
+            Note that selecting a full kind with an optlevel of 9 (the
+            maximum) guarantees the creation of an index with zero entropy,
+            that is, a completely sorted index (CSI) - provided that the
+            number of rows in the table does not exceed the 2**48 figure
+            (that is more than 100 trillion rows). See the
+            :meth:`Column.create_csindex` method for a more direct way to
+            create a CSI index.
+        filters : Filters
+            Specify the Filters instance used to compress the index. If
+            None, default index filters will be used (currently, zlib
+            level 1 with shuffling).
+        tmp_dir
+            When kind is other than 'ultralight', a temporary file is
+            created during the index build process. You can use the tmp_dir
+            argument to specify the directory for this temporary file. The
+            default is to create it in the same directory as the file
+            containing the original table.
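+
+        Examples
+        --------
+
+        A minimal usage sketch (the column name 'energy' is illustrative
+        only)::
+
+            indexrows = table.cols.energy.create_index(optlevel=9,
+                                                        kind='full')
+            print("indexed rows:", indexrows)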
+ + """ + + kinds = ['ultralight', 'light', 'medium', 'full'] + if kind not in kinds: + raise ValueError("Kind must have any of these values: %s" % kinds) + if (not isinstance(optlevel, int) or + (optlevel < 0 or optlevel > 9)): + raise ValueError("Optimization level must be an integer in the " + "range 0-9") + if filters is None: + filters = default_index_filters + if tmp_dir is None: + tmp_dir = str(Path(self._table_file.filename).parent) + else: + if not Path(tmp_dir).is_dir(): + raise ValueError( + f"Temporary directory '{tmp_dir}' does not exist" + ) + if (_blocksizes is not None and + (not isinstance(_blocksizes, tuple) or len(_blocksizes) != 4)): + raise ValueError("_blocksizes must be a tuple with exactly 4 " + "elements") + idxrows = _column__create_index(self, optlevel, kind, filters, + tmp_dir, _blocksizes, _verbose) + return SizeType(idxrows) + + def create_csindex(self, filters=None, tmp_dir=None, + _blocksizes=None, _testmode=False, _verbose=False): + """Create a completely sorted index (CSI) for this column. + + This method guarantees the creation of an index with zero entropy, that + is, a completely sorted index (CSI) -- provided that the number of rows + in the table does not exceed the 2**48 figure (that is more than 100 + trillions of rows). A CSI index is needed for some table methods (like + :meth:`Table.itersorted` or :meth:`Table.read_sorted`) in order to + ensure completely sorted results. + + For the meaning of filters and tmp_dir arguments see + :meth:`Column.create_index`. + + Notes + ----- + This method is equivalent to + Column.create_index(optlevel=9, kind='full', ...). + + """ + + return self.create_index( + kind='full', optlevel=9, filters=filters, tmp_dir=tmp_dir, + _blocksizes=_blocksizes, _testmode=_testmode, _verbose=_verbose) + + def _do_reindex(self, dirty): + """Common code for reindex() and reindex_dirty() codes.""" + + index = self.index + dodirty = True + if dirty and not index.dirty: + dodirty = False + if index is not None and dodirty: + self._table_file._check_writable() + # Get the old index parameters + kind = index.kind + optlevel = index.optlevel + filters = index.filters + # We *need* to tell the index that it is going to be undirty. + # This is needed here so as to unnail() the condition cache. + index.dirty = False + # Delete the existing Index + index._f_remove() + # Create a new Index with the previous parameters + return SizeType(self.create_index( + kind=kind, optlevel=optlevel, filters=filters)) + else: + return SizeType(0) # The column is not intended for indexing + + def reindex(self): + """Recompute the index associated with this column. + + This can be useful when you suspect that, for any reason, + the index information is no longer valid and you want to rebuild it. + + This method does nothing if the column is not indexed. + + """ + + self._do_reindex(dirty=False) + + def reindex_dirty(self): + """Recompute the associated index only if it is dirty. + + This can be useful when you have set :attr:`Table.autoindex` to false + for the table and you want to update the column's index after an + invalidating index operation (like :meth:`Table.remove_rows`). + + This method does nothing if the column is not indexed. + + """ + + self._do_reindex(dirty=True) + + def remove_index(self): + """Remove the index associated with this column. + + This method does nothing if the column is not indexed. The removed + index can be created again by calling the :meth:`Column.create_index` + method. 
+ + """ + + self._table_file._check_writable() + + # Remove the index if existing. + if self.is_indexed: + index = self.index + index._f_remove() + self.table._set_column_indexing(self.pathname, False) + + def close(self): + """Close this column.""" + + self.__dict__.clear() + + def __str__(self): + """The string representation for this object.""" + + return (f"{self._table_path}.cols.{self.pathname.replace('/', '.')} " + f"({self.__class__.__name__}{self.shape}, " + f"{self.descr._v_types[self.name]}, idx={self.index})") + + def __repr__(self): + """A detailed string representation for this object.""" + + return str(self) diff --git a/tables/tableextension.pyx b/tables/tableextension.pyx new file mode 100644 index 0000000..1c03f31 --- /dev/null +++ b/tables/tableextension.pyx @@ -0,0 +1,1653 @@ +######################################################################## +# +# License: BSD +# Created: June 17, 2005 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +"""Here is where Table and Row extension types live. + +Classes (type extensions): + + Table + Row + +Functions: + +Misc variables: + +""" + +import sys +import numpy +from time import time + +from .description import Col +from .exceptions import HDF5ExtError +from .conditions import call_on_recarr +from .utilsextension import (get_nested_field, atom_from_hdf5_type, + create_nested_type, hdf5_to_np_ext_type, create_nested_type, platform_byteorder, + pttype_to_hdf5, pt_special_kinds, npext_prefixes_to_ptkinds, hdf5_class_to_string, + H5T_STD_I64) +from .utils import SizeType + +from .utilsextension cimport get_native_type, cstr_to_pystr + +# numpy functions & objects +from hdf5extension cimport Leaf +from cpython cimport PyErr_Clear +from libc.stdio cimport snprintf +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, strdup, strcmp, strlen +from numpy cimport (import_array, ndarray, npy_intp, PyArray_GETITEM, + PyArray_SETITEM, PyArray_BYTES, PyArray_DATA, PyArray_NDIM, PyArray_STRIDE) +from .definitions cimport (hid_t, herr_t, hsize_t, htri_t, hbool_t, + H5F_ACC_RDONLY, H5P_DEFAULT, H5D_CHUNKED, H5T_DIR_DEFAULT, + H5F_SCOPE_LOCAL, H5F_SCOPE_GLOBAL, H5T_COMPOUND, H5Tget_order, + H5Fflush, H5Dget_create_plist, H5T_ORDER_LE, + H5D_layout_t, H5Dopen, H5Dclose, H5Dread, H5Dget_type, H5Dget_space, + H5Pget_layout, H5Pget_chunk, H5Pclose, + H5Sget_simple_extent_ndims, H5Sget_simple_extent_dims, H5Sclose, + H5T_class_t, H5Tget_size, H5Tset_size, H5Tcreate, H5Tcopy, H5Tclose, + H5Tget_nmembers, H5Tget_member_name, H5Tget_member_type, H5Tget_native_type, + H5Tget_member_offset, H5Tinsert, H5Tget_class, H5Tget_super, H5Tget_offset, + H5T_cset_t, H5T_CSET_ASCII, H5T_CSET_UTF8, + H5ATTRset_attribute_string, H5ATTRset_attribute, + get_len_of_range, get_order, set_order, is_complex, + conv_float64_timeval32, truncate_dset, + pt_H5free_memory) + +from .lrucacheextension cimport ObjectCache, NumCache + + + +#----------------------------------------------------------------- + +# Optimized HDF5 API for PyTables +cdef extern from "H5TB-opt.h" nogil: + + herr_t H5TBOmake_table( char *table_title, hid_t loc_id, char *dset_name, + char *version, char *class_, + hid_t mem_type_id, hsize_t nrecords, + hsize_t chunk_size, void *fill_data, int compress, + char *complib, int shuffle, int fletcher32, + hbool_t track_times, void *data ) + + herr_t H5TBOread_records( hid_t dataset_id, hid_t mem_type_id, + hsize_t start, hsize_t nrecords, void *data ) + + 
herr_t H5TBOread_elements( hid_t dataset_id, hid_t mem_type_id, + hsize_t nrecords, void *coords, void *data ) + + herr_t H5TBOappend_records( hid_t dataset_id, hid_t mem_type_id, + hsize_t nrecords, hsize_t nrecords_orig, + void *data ) + + herr_t H5TBOwrite_records ( hid_t dataset_id, hid_t mem_type_id, + hsize_t start, hsize_t nrecords, + hsize_t step, void *data ) + + herr_t H5TBOwrite_elements( hid_t dataset_id, hid_t mem_type_id, + hsize_t nrecords, void *coords, void *data ) + + herr_t H5TBOdelete_records( hid_t dataset_id, hid_t mem_type_id, + hsize_t ntotal_records, size_t src_size, + hsize_t start, hsize_t nrecords, + hsize_t maxtuples ) + + +#---------------------------------------------------------------------------- + +# Initialization code + +# The numpy API requires this function to be called before +# using any numpy facilities in an extension module. +import_array() + +#------------------------------------------------------------- + + +# Private functions +cdef get_nested_field_cache(recarray, fieldname, fieldcache): + """Get the maybe nested field named `fieldname` from the `recarray`. + + The `fieldname` may be a simple field name or a nested field name with + slah-separated components. It can also be an integer specifying the position + of the field. + + """ + + try: + field = fieldcache[fieldname] + except KeyError: + # Check whether fieldname is an integer and if so, get the field + # straight from the recarray dictionary (it can't be anywhere else) + if isinstance(fieldname, int): + field = recarray[fieldname] + else: + field = get_nested_field(recarray, fieldname) + fieldcache[fieldname] = field + return field + + +cdef join_path(object parent, object name): + if parent == "": + return name + else: + return parent + '/' + name + + +# Public classes + +cdef class Table(Leaf): + # instance variables + cdef void *wbuf + + def _create_table(self, title, complib, obversion): + cdef int offset + cdef int ret + cdef long buflen + cdef hid_t oid + cdef void *data + cdef hsize_t nrows + cdef bytes class_ + cdef ndarray wdflts + cdef void *fill_data + cdef ndarray recarr + cdef object name + cdef bytes encoded_title, encoded_complib, encoded_obversion + cdef char *ctitle = NULL + cdef char *cobversion = NULL + cdef bytes encoded_name + cdef char fieldname[128] + cdef int i + cdef H5T_cset_t cset = H5T_CSET_ASCII + + encoded_title = title.encode('utf-8') + encoded_complib = complib.encode('utf-8') + encoded_obversion = obversion.encode('utf-8') + encoded_name = self.name.encode('utf-8') + + # Get the C pointer + ctitle = encoded_title + cobversion = encoded_obversion + + # Compute the complete compound datatype based on the table description + self.disk_type_id = create_nested_type(self.description, self.byteorder) + #self.type_id = H5Tcopy(self.disk_type_id) + # A H5Tcopy only is not enough, as we want the in-memory type to be + # in the byteorder of the machine (sys.byteorder). 
+ self.type_id = create_nested_type(self.description, sys.byteorder) + + # The fill values area + wdflts = self._v_wdflts + if wdflts is None: + fill_data = NULL + else: + fill_data = PyArray_DATA(wdflts) + + # test if there is data to be saved initially + if self._v_recarray is not None: + recarr = self._v_recarray + data = PyArray_DATA(recarr) + else: + data = NULL + + class_ = self._c_classid.encode('utf-8') + self.dataset_id = H5TBOmake_table(ctitle, self.parent_id, encoded_name, + cobversion, class_, self.disk_type_id, + self.nrows, self.chunkshape[0], + fill_data, + self.filters.complevel, encoded_complib, + self.filters.shuffle_bitshuffle, + self.filters.fletcher32, + self._want_track_times, data) + if self.dataset_id < 0: + raise HDF5ExtError("Problems creating the table") + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + cset = H5T_CSET_UTF8 + # Set the conforming table attributes + # Attach the CLASS attribute + ret = H5ATTRset_attribute_string(self.dataset_id, "CLASS", class_, + len(class_), cset) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." % + ("CLASS", self.name)) + # Attach the VERSION attribute + ret = H5ATTRset_attribute_string(self.dataset_id, "VERSION", cobversion, + len(encoded_obversion), cset) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." % + ("VERSION", self.name)) + # Attach the TITLE attribute + ret = H5ATTRset_attribute_string(self.dataset_id, "TITLE", ctitle, + len(encoded_title), cset) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." % + ("TITLE", self.name)) + # Attach the NROWS attribute + nrows = self.nrows + ret = H5ATTRset_attribute(self.dataset_id, "NROWS", H5T_STD_I64, + 0, NULL, &nrows) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." % + ("NROWS", self.name)) + + # Attach the FIELD_N_NAME attributes + # We write only the first level names + for i, name in enumerate(self.description._v_names): + snprintf(fieldname, 128, "FIELD_%d_NAME", i) + encoded_name = name.encode('utf-8') + ret = H5ATTRset_attribute_string(self.dataset_id, fieldname, + encoded_name, len(encoded_name), + cset) + if ret < 0: + raise HDF5ExtError("Can't set attribute '%s' in table:\n %s." % + (fieldname, self.name)) + + # If created in PyTables, the table is always chunked + self._chunked = True # Accessible from python + + # Finally, return the object identifier. 
+ return self.dataset_id + + + cdef get_nested_type(self, hid_t type_id, hid_t native_type_id, + object colpath, object field_byteorders): + """Open a nested type and return a nested dictionary as description.""" + + cdef hid_t member_type_id, native_member_type_id, member_offset + cdef hsize_t nfields, i + cdef hsize_t dims[1] + cdef size_t itemsize + cdef char *c_colname + cdef H5T_class_t class_id + cdef char c_byteorder2[11] # "irrelevant" fits easily here + cdef char *sys_byteorder + cdef object desc, colobj, colpath2, typeclassname, typeclass + cdef object byteorder + cdef str colname, byteorder2 + + offset = 0 + desc = {} + # Get the number of members + nfields = H5Tget_nmembers(type_id) + + # Iterate through fields to get the correct order that elements may appear in + # The object type can be stored not in order, so order based on the offset in the data + position_order = [] + for i in range(nfields): + member_offset = H5Tget_member_offset(type_id, i) + position_order.append((member_offset, i)) + + position_order.sort() + + # Iterate thru the members + for pos, i in enumerate([x[1] for x in position_order]): + # Get the member name + c_colname = H5Tget_member_name(type_id, i) + colname = cstr_to_pystr(c_colname) + + # Get the member type + member_type_id = H5Tget_member_type(type_id, i) + # Get the member offset + member_offset = H5Tget_member_offset(type_id, i) + # Get the HDF5 class + class_id = H5Tget_class(member_type_id) + if class_id == H5T_COMPOUND and not is_complex(member_type_id): + colpath2 = join_path(colpath, colname) + # Create the native data in-memory + itemsize = H5Tget_size(member_type_id) + native_member_type_id = H5Tcreate(H5T_COMPOUND, itemsize) + desc[colname], itemsize = self.get_nested_type( + member_type_id, native_member_type_id, colpath2, field_byteorders) + desc[colname]["_v_pos"] = pos + desc[colname]["_v_offset"] = member_offset + else: + # Get the member format and the corresponding Col object + try: + native_member_type_id = get_native_type(member_type_id) + atom = atom_from_hdf5_type(native_member_type_id) + colobj = Col.from_atom(atom, pos=pos, _offset=member_offset) + itemsize = H5Tget_size(native_member_type_id) + except TypeError, te: + # Re-raise TypeError again with more info + raise TypeError( + ("table ``%s``, column ``%s``: %%s" % (self.name, colname)) + % te.args[0]) + desc[colname] = colobj + # For time kinds, save the byteorder of the column + # (useful for conversion of time datatypes later on) + if colobj.kind == "time": + colobj._byteorder = H5Tget_order(member_type_id) + if colobj._byteorder == H5T_ORDER_LE: + field_byteorders.append("little") + else: + field_byteorders.append("big") + elif colobj.kind in ['int', 'uint', 'float', 'complex', 'enum']: + # Keep track of the byteorder for this column + get_order(member_type_id, c_byteorder2) + byteorder2 = cstr_to_pystr(c_byteorder2) + if byteorder2 in ["little", "big"]: + field_byteorders.append(byteorder2) + + # Insert the native member + H5Tinsert(native_type_id, c_colname, member_offset, native_member_type_id) + # Update the offset + offset = offset + itemsize + # Release resources + H5Tclose(native_member_type_id) + H5Tclose(member_type_id) + pt_H5free_memory(c_colname) + + # set the byteorder and other things (just in top level) + if colpath == "": + # Compute a byteorder for the entire table + if len(field_byteorders) > 0: + field_byteorders = numpy.array(field_byteorders) + # Cython doesn't interpret well the extended comparison + # operators so this: field_byteorders == 
"little" doesn't work + # as expected + if numpy.alltrue(field_byteorders.__eq__("little")): + byteorder = "little" + elif numpy.alltrue(field_byteorders.__eq__("big")): + byteorder = "big" + else: # Yes! someone has done it! + byteorder = "mixed" + else: + byteorder = "irrelevant" + self.byteorder = byteorder + + return desc, offset + + def _get_info(self): + """Get info from a table on disk.""" + + cdef hid_t space_id, plist + cdef size_t type_size, size2 + cdef hsize_t dims[1] # enough for unidimensional tables + cdef hsize_t chunksize[1] + cdef H5D_layout_t layout + cdef bytes encoded_name + + encoded_name = self.name.encode('utf-8') + + # Open the dataset + self.dataset_id = H5Dopen(self.parent_id, encoded_name, H5P_DEFAULT) + if self.dataset_id < 0: + raise HDF5ExtError("Non-existing node ``%s`` under ``%s``" % + (self.name, self._v_parent._v_pathname)) + + # Get the datatype on disk + self.disk_type_id = H5Dget_type(self.dataset_id) + if H5Tget_class(self.disk_type_id) != H5T_COMPOUND: + raise ValueError("Node ``%s`` is not a Table object" % + (self._v_parent._v_leaves[self.name]._v_pathname)) + # Get the number of rows + space_id = H5Dget_space(self.dataset_id) + H5Sget_simple_extent_dims(space_id, dims, NULL) + self.nrows = SizeType(dims[0]) + # Free resources + H5Sclose(space_id) + + # Get the layout of the datatype + plist = H5Dget_create_plist(self.dataset_id) + layout = H5Pget_layout(plist) + if layout == H5D_CHUNKED: + self._chunked = 1 + # Get the chunksize + H5Pget_chunk(plist, 1, chunksize) + else: + self._chunked = 0 + chunksize[0] = 0 + H5Pclose(plist) + + # Get the type size + type_size = H5Tget_size(self.disk_type_id) + # Create the native data in-memory + self.type_id = H5Tcreate(H5T_COMPOUND, type_size) + # Fill-up the (nested) native type and description + desc, offset = self.get_nested_type(self.disk_type_id, self.type_id, "", []) + + if desc == {}: + raise HDF5ExtError("Problems getting desciption for table %s", self.name) + + if offset < type_size: + # Trailing padding, set the itemsize to the correct type_size (see #765) + desc['_v_itemsize'] = type_size + + + # Return the object ID and the description + return (self.dataset_id, desc, SizeType(chunksize[0])) + + cdef _convert_time64_(self, ndarray nparr, hsize_t nrecords, int sense): + """Converts a NumPy of Time64 elements between NumPy and HDF5 formats. + + NumPy to HDF5 conversion is performed when 'sense' is 0. Otherwise, HDF5 + to NumPy conversion is performed. The conversion is done in place, + i.e. 'nparr' is modified. + + """ + + cdef void *t64buf + cdef long byteoffset + cdef npy_intp bytestride, nelements + + byteoffset = 0 # NumPy objects doesn't have an offset + bytestride = PyArray_STRIDE(nparr, 0) # supports multi-dimensional recarray + # Compute the number of elements in the multidimensional cell + nelements = nparr.size // len(nparr) + t64buf = PyArray_DATA(nparr) + + conv_float64_timeval32( + t64buf, byteoffset, bytestride, nrecords, nelements, sense) + + cpdef _convert_types(self, ndarray recarr, hsize_t nrecords, int sense): + """Converts columns in 'recarr' between NumPy and HDF5 formats. + + NumPy to HDF5 conversion is performed when 'sense' is 0. Otherwise, HDF5 + to NumPy conversion is performed. The conversion is done in place, + i.e. 'recarr' is modified. 
+ + """ + + # For reading, first swap the byteorder by hand + # (this is not currently supported by HDF5) + if sense == 1: + for colpathname in self.colpathnames: + if self.coltypes[colpathname] in ["time32", "time64"]: + colobj = self.coldescrs[colpathname] + if hasattr(colobj, "_byteorder"): + if colobj._byteorder != platform_byteorder: + column = get_nested_field(recarr, colpathname) + # Do an *inplace* byteswapping + column.byteswap(True) + + # This should be generalised to support other type conversions. + for t64cname in self._time64colnames: + column = get_nested_field(recarr, t64cname) + self._convert_time64_(column, nrecords, sense) + + def _open_append(self, ndarray recarr): + self._v_recarray = recarr + # Get the pointer to the buffer data area + self.wbuf = PyArray_DATA(recarr) + + def _append_records(self, hsize_t nrecords): + cdef int ret + cdef hsize_t nrows + + # Convert some NumPy types to HDF5 before storing. + self._convert_types(self._v_recarray, nrecords, 0) + + nrows = self.nrows + # release GIL (allow other threads to use the Python interpreter) + with nogil: + # Append the records: + ret = H5TBOappend_records(self.dataset_id, self.type_id, + nrecords, nrows, self.wbuf) + + if ret < 0: + raise HDF5ExtError("Problems appending the records.") + + self.nrows = self.nrows + nrecords + + def _close_append(self): + cdef hsize_t nrows + + if self._v_file.params['PYTABLES_SYS_ATTRS']: + # Update the NROWS attribute + nrows = self.nrows + if (H5ATTRset_attribute(self.dataset_id, "NROWS", H5T_STD_I64, + 0, NULL, &nrows) < 0): + raise HDF5ExtError("Problems setting the NROWS attribute.") + + # Set the caches to dirty (in fact, and for the append case, + # it should be only the caches based on limits, but anyway) + self._dirtycache = True + # Delete the reference to recarray as we doesn't need it anymore + self._v_recarray = None + + def _update_records(self, hsize_t start, hsize_t stop, + hsize_t step, ndarray recarr): + cdef herr_t ret + cdef void *rbuf + cdef hsize_t nrecords, nrows + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(recarr) + + # Compute the number of records to update + nrecords = len(recarr) + nrows = get_len_of_range(start, stop, step) + if nrecords > nrows: + nrecords = nrows + + # Convert some NumPy types to HDF5 before storing. + self._convert_types(recarr, nrecords, 0) + # Update the records: + with nogil: + ret = H5TBOwrite_records(self.dataset_id, self.type_id, + start, nrecords, step, rbuf ) + + if ret < 0: + raise HDF5ExtError("Problems updating the records.") + + # Set the caches to dirty + self._dirtycache = True + + def _update_elements(self, hsize_t nrecords, ndarray coords, + ndarray recarr): + cdef herr_t ret + cdef void *rbuf + cdef void *rcoords + + # Get the chunk of the coords that correspond to a buffer + rcoords = PyArray_DATA(coords) + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(recarr) + + # Convert some NumPy types to HDF5 before storing. 
+ self._convert_types(recarr, nrecords, 0) + + # Update the records: + with nogil: + ret = H5TBOwrite_elements(self.dataset_id, self.type_id, + nrecords, rcoords, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems updating the records.") + + # Set the caches to dirty + self._dirtycache = True + + def _read_records(self, hsize_t start, hsize_t nrecords, ndarray recarr): + cdef void *rbuf + cdef int ret + + # Correct the number of records to read, if needed + if (start + nrecords) > self.nrows: + nrecords = self.nrows - start + + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(recarr) + + # Read the records from disk + with nogil: + ret = H5TBOread_records(self.dataset_id, self.type_id, start, + nrecords, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems reading records.") + + # Convert some HDF5 types to NumPy after reading. + self._convert_types(recarr, nrecords, 1) + + return nrecords + + cdef hsize_t _read_chunk(self, hsize_t nchunk, ndarray iobuf, long cstart): + cdef long nslot + cdef hsize_t start, nrecords, chunkshape + cdef int ret + cdef void *rbuf + cdef NumCache chunkcache + + chunkcache = self._chunkcache + chunkshape = chunkcache.slotsize + # Correct the number of records to read, if needed + start = nchunk*chunkshape + nrecords = chunkshape + if (start + nrecords) > self.nrows: + nrecords = self.nrows - start + rbuf = PyArray_BYTES(iobuf) + cstart * chunkcache.itemsize + # Try to see if the chunk is in cache + nslot = chunkcache.getslot_(nchunk) + if nslot >= 0: + chunkcache.getitem_(nslot, rbuf, 0) + else: + # Chunk is not in cache. Read it and put it in the LRU cache. + with nogil: + ret = H5TBOread_records(self.dataset_id, self.type_id, + start, nrecords, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems reading chunk records.") + nslot = chunkcache.setitem_(nchunk, rbuf, 0) + return nrecords + + def _read_elements(self, ndarray coords, ndarray recarr): + cdef long nrecords + cdef void *rbuf + cdef void *rbuf2 + cdef int ret + + # Get the chunk of the coords that correspond to a buffer + nrecords = coords.size + # Get the pointer to the buffer data area + rbuf = PyArray_DATA(recarr) + # Get the pointer to the buffer coords area + rbuf2 = PyArray_DATA(coords) + + with nogil: + ret = H5TBOread_elements(self.dataset_id, self.type_id, + nrecords, rbuf2, rbuf) + + if ret < 0: + raise HDF5ExtError("Problems reading records.") + + # Convert some HDF5 types to NumPy after reading. + self._convert_types(recarr, nrecords, 1) + + return nrecords + + def _remove_rows(self, hsize_t start, hsize_t stop, long step): + cdef size_t rowsize + cdef hsize_t nrecords=0, nrecords2 + cdef hsize_t i + + if step == 1: + nrecords = stop - start + rowsize = self.rowsize + # Using self.disk_type_id should be faster (i.e. 
less conversions) + if (H5TBOdelete_records(self.dataset_id, self.disk_type_id, + self.nrows, rowsize, start, nrecords, + self.nrowsinbuf) < 0): + raise HDF5ExtError("Problems deleting records.") + + self.nrows = self.nrows - nrecords + if self._v_file.params['PYTABLES_SYS_ATTRS']: + # Attach the NROWS attribute + nrecords2 = self.nrows + H5ATTRset_attribute(self.dataset_id, "NROWS", H5T_STD_I64, + 0, NULL, &nrecords2) + # Set the caches to dirty + self._dirtycache = True + elif step == -1: + nrecords = self._remove_rows(stop+1, start+1, 1) + elif step >= 1: + # always want to go through the space backwards + for i in range(stop - step, start - step, -step): + nrecords += self._remove_rows(i, i+1, 1) + elif step <= -1: + # always want to go through the space backwards + for i in range(start, stop, step): + nrecords += self._remove_rows(i, i+1, 1) + else: + raise ValueError("step size may not be 0.") + + # Return the number of records removed + return nrecords + +cdef class Row: + """Table row iterator and field accessor. + + Instances of this class are used to fetch and set the values + of individual table fields. It works very much like a dictionary, + where keys are the pathnames or positions (extended slicing is + supported) of the fields in the associated table in a specific row. + + This class provides an *iterator interface* + so that you can use the same Row instance to + access successive table rows one after the other. There are also + some important methods that are useful for accessing, adding and + modifying values in tables. + + .. rubric:: Row attributes + + .. attribute:: nrow + + The current row number. + + This property is useful for knowing which row is being dealt with in the + middle of a loop or iterator. + + """ + + cdef npy_intp _stride + cdef long _row, _unsaved_nrows, _mod_nrows + cdef hsize_t start, absstep + cdef long long stop, step, nextelement, _nrow, stopb # has to be long long, not hsize_t, for negative step sizes + cdef hsize_t nrowsinbuf, nrows, nrowsread + cdef hsize_t chunksize, nchunksinbuf, totalchunks + cdef hsize_t startb, lenbuf + cdef long long indexchunk + cdef int bufcounter, counter + cdef int exist_enum_cols + cdef int _riterator, _rowsize, _write_to_seqcache + cdef int wherecond, indexed + cdef int ro_filemode, chunked + cdef int _bufferinfo_done, sss_on + cdef int iterseq_max_elements + cdef ndarray bufcoords, indexvalid, indexvalues, chunkmap + cdef hsize_t *bufcoords_data + cdef hsize_t *index_values_data + cdef char *chunkmap_data + cdef char *index_valid_data + cdef object dtype + cdef object iobuf, iobufcpy + cdef object wrec, wreccpy + cdef object wfields, rfields + cdef object coords + cdef object condfunc, condargs, condkwargs + cdef object mod_elements, colenums + cdef object rfieldscache, wfieldscache + cdef object iterseq + cdef object _table_file, _table_path + cdef object modified_fields + cdef object seqcache_key + + # The nrow() method has been converted into a property, which is handier + property nrow: + """The current row number. + + This property is useful for knowing which row is being dealt with in the + middle of a loop or iterator. + + """ + + def __get__(self): + return SizeType(self._nrow) + + property table: + def __get__(self): + self._table_file._check_open() + return self._table_file._get_node(self._table_path) + + def __cinit__(self, table): + cdef int nfields, i + # Location-dependent information. 
+
+    self._table_file = table._v_file
+    self._table_path = table._v_pathname
+    self._unsaved_nrows = 0
+    self._mod_nrows = 0
+    self._row = 0
+    self._nrow = 0   # Useful in mod_append read iterators
+    self._riterator = 0
+    self._bufferinfo_done = 0
+    # Some variables from table will be cached here
+    if table._v_file.mode == 'r':
+      self.ro_filemode = 1
+    else:
+      self.ro_filemode = 0
+    self.chunked = table._chunked
+    self.colenums = table._colenums
+    self.exist_enum_cols = len(self.colenums)
+    self.nrowsinbuf = table.nrowsinbuf
+    self.chunksize = table.chunkshape[0]
+    self.nchunksinbuf = self.nrowsinbuf // self.chunksize
+    self.dtype = table._v_dtype
+    self._new_buffer(table)
+    self.mod_elements = None
+    self.rfieldscache = {}
+    self.wfieldscache = {}
+    self.modified_fields = set()
+
+  def _iter(self, start=0, stop=0, step=1, coords=None, chunkmap=None):
+    """Return an iterator for traversing the data in the table."""
+    self._init_loop(start, stop, step, coords, chunkmap)
+    return iter(self)
+
+  def __iter__(self):
+    """Iterator that traverses all the data in the Table."""
+    return self
+
+  cdef _new_buffer(self, table):
+    """Create the recarrays for I/O buffering."""
+
+    wdflts = table._v_wdflts
+    if wdflts is None:
+      self.wrec = numpy.zeros(1, dtype=self.dtype)  # Defaults are zero
+    else:
+      self.wrec = table._v_wdflts.copy()
+    self.wreccpy = self.wrec.copy()  # A copy of the defaults
+    # Build the wfields dictionary for faster access to columns
+    self.wfields = {}
+    for name in self.dtype.names:
+      self.wfields[name] = self.wrec[name]
+
+    # Get the read buffer for this instance (it is private, remember!)
+    buff = self.iobuf = table._get_container(self.nrowsinbuf)
+    # Build the rfields dictionary for faster access to columns
+    # This is quite fast, as it only takes around 5 us per column
+    # on my laptop (Pentium 4 @ 2 GHz).
+    # F.
Alted 2006-08-18 + self.rfields = {} + for i, name in enumerate(self.dtype.names): + self.rfields[i] = buff[name] + self.rfields[name] = buff[name] + + # Get the stride of these buffers + self._stride = PyArray_STRIDE(buff, 0) + # The rowsize + self._rowsize = self.dtype.itemsize + self.nrows = table.nrows # This value may change + + cdef _init_loop(self, hsize_t start, long long stop, long long step, + object coords, object chunkmap): + """Initialization for the __iter__ iterator""" + table = self.table + self._riterator = 1 # We are inside a read iterator + self.start = start + self.stop = stop + self.step = step + self.coords = coords + self.startb = 0 + if step > 0: + self._row = -1 # a sentinel + self.nrowsread = start + elif step < 0: + self._row = 0 + self.nrowsread = 0 + self.nextelement = start + self._nrow = start - self.step + self.wherecond = 0 + self.indexed = 0 + + self.nrows = table.nrows # Update the row counter + + if coords is not None and 0 < step: + self.nrowsread = start + self.nextelement = start + self.stop = min(stop, len(coords)) + self.absstep = abs(step) + return + elif coords is not None and 0 > step: + #self.nrowsread = 0 + #self.nextelement = start + #self.stop = min(stop, len(coords)) + #self.stop = max(stop, start - len(coords)) + self.absstep = abs(step) + return + + if table._where_condition: + self.wherecond = 1 + #self.condkwargs = {'ex_uses_vml': True} + self.condfunc, self.condargs, self.condkwargs = table._where_condition + table._where_condition = None + + if table._use_index: + # Indexing code depends on this condition (see #319) + assert self.nrowsinbuf % self.chunksize == 0 + self.indexed = 1 + # Compute totalchunks here because self.nrows can change during the + # life of a Row instance. + self.totalchunks = self.nrows // self.chunksize + if self.nrows % self.chunksize: + self.totalchunks = self.totalchunks + 1 + self.nrowsread = 0 + self.nextelement = 0 + self.chunkmap = chunkmap + self.chunkmap_data = PyArray_BYTES(self.chunkmap) + table._use_index = False + self.lenbuf = self.nrowsinbuf + # Check if we have limitations on start, stop, step + self.sss_on = (self.start > 0 or self.stop < self.nrows or self.step > 1) + + self.seqcache_key = table._seqcache_key + table._seqcache_key = None + if self.seqcache_key is not None: + self._write_to_seqcache = 1 + self.iterseq_max_elements = table._v_file.params['ITERSEQ_MAX_ELEMENTS'] + self.iterseq = [] # all the row indexes, unless it would be longer than ITERSEQ_MAX_ELEMENTS + else: + self._write_to_seqcache = 0 + self.iterseq = None + + def __next__(self): + """next() method for __iter__() that is called on each iteration""" + + if not self._riterator: + # The iterator is already exhausted! 
+ raise StopIteration + if self.indexed: + return self.__next__indexed() + elif self.coords is not None: + return self.__next__coords() + elif self.wherecond: + return self.__next__inkernel() + else: + return self.__next__general() + + cdef __next__indexed(self): + """The version of next() for indexed columns and a chunkmap.""" + + cdef long recout, j, cs, vlen, rowsize + cdef hsize_t nchunksread + cdef object tmp_range + cdef Table table + cdef ndarray iobuf + cdef void *IObufData + cdef long nslot + cdef object seq + cdef object seqcache + + assert self.nrowsinbuf >= self.chunksize + while self.nextelement < self.stop: + if self.nextelement >= self.nrowsread: + # Skip until there is interesting information + while self.start > self.nrowsread + self.nrowsinbuf: + self.nrowsread = self.nrowsread + self.nrowsinbuf + self.nextelement = self.nextelement + self.nrowsinbuf + + table = self.table + iobuf = self.iobuf + j = 0; recout = 0; cs = self.chunksize + nchunksread = self.nrowsread // cs + tmp_range = numpy.arange(0, cs, dtype='int64') + self.bufcoords = numpy.empty(self.nrowsinbuf, dtype='int64') + # Fetch valid chunks until the I/O buffer is full + while nchunksread < self.totalchunks: + if self.chunkmap_data[nchunksread]: + self.bufcoords[j*cs:(j+1)*cs] = tmp_range + self.nrowsread + # Not optimized read + # recout = recout + table._read_records( + # nchunksread*cs, cs, iobuf[j*cs:]) + # + # Optimized read through the use of a chunk cache. This cache has + # more or less the same speed than the integrated HDF5 chunk + # cache, but using the PyTables one has the advantage that the + # user can easily change this parameter. + recout = recout + table._read_chunk(nchunksread, iobuf, j*cs) + j = j + 1 + self.nrowsread = (nchunksread+1)*cs + if self.nrowsread > self.stop: + self.nrowsread = self.stop + break + elif j == self.nchunksinbuf: + break + nchunksread = nchunksread + 1 + + # Evaluate the condition on this table fragment. 
+ iobuf = iobuf[:recout] + + if len(iobuf) > 0: + self.table._convert_types(iobuf, len(iobuf), 1) + self.indexvalid = call_on_recarr( + self.condfunc, self.condargs, iobuf, **self.condkwargs) + self.index_valid_data = PyArray_BYTES(self.indexvalid) + # Get the valid coordinates + self.indexvalues = self.bufcoords[:recout][self.indexvalid] + self.index_values_data = PyArray_DATA(self.indexvalues) + self.lenbuf = self.indexvalues.size + # Place the valid results at the beginning of the buffer + iobuf[:self.lenbuf] = iobuf[self.indexvalid] + + # Initialize the internal buffer row counter + self._row = -1 + + if self._write_to_seqcache: + # Feed the indexvalues into the seqcache + seqcache = self.iterseq + if self.lenbuf + len(seqcache) < self.iterseq_max_elements: + seqcache.extend(self.indexvalues) + else: + self.iterseq = None + self._write_to_seqcache = 0 + + self._row = self._row + 1 + # Check whether we have read all the rows in buf + if self._row == self.lenbuf: + self.nextelement = self.nrowsread + # Make _row to point to the last valid entry in buffer + # (this is useful for accessing the last row after an iterator loop) + self._row = self._row - 1 + continue + self._nrow = self.index_values_data[self._row] + # Check additional conditions on start, stop, step params + if self.sss_on: + if (self._nrow < self.start or self._nrow >= self.stop): + self.nextelement = self.nextelement + 1 + continue + if (self.step > 1 and + ((self._nrow - self.start) % self.step > 0)): + self.nextelement = self.nextelement + 1 + continue + # Return this row + self.nextelement = self._nrow + 1 + return self + else: + # All the elements have been read for this mode + self._finish_riterator() + + cdef __next__coords(self): + """The version of next() for user-required coordinates""" + cdef int recout + cdef long long lenbuf, nextelement + cdef object tmp + if 0 < self.step: + while self.nextelement < self.stop: + if self.nextelement >= self.nrowsread: + # Correction for avoiding reading past self.stop + if self.nrowsread+self.nrowsinbuf > self.stop: + lenbuf = self.stop-self.nrowsread + else: + lenbuf = self.nrowsinbuf + tmp = self.coords[self.nrowsread:self.nrowsread+lenbuf:self.step] + # We have to get a contiguous buffer, so numpy.array is the way to go + self.bufcoords = numpy.array(tmp, dtype="uint64") + self._row = -1 + if self.bufcoords.size > 0: + recout = self.table._read_elements(self.bufcoords, self.iobuf) + else: + recout = 0 + self.bufcoords_data = PyArray_DATA(self.bufcoords) + self.nrowsread = self.nrowsread + lenbuf + if recout == 0: + # no items were read, skip out + continue + self._row = self._row + 1 + self._nrow = self.bufcoords_data[self._row] + self.nextelement = self.nextelement + self.absstep + return self + else: + # All the elements have been read for this mode + self._finish_riterator() + elif 0 > self.step: + #print("self.nextelement = ", self.nextelement, self.start, self.nrowsread, self.nextelement < self.start - self.nrowsread + 1) + while self.nextelement > self.stop: + if self.nextelement < self.start - ( self.nrowsread) + 1: + if 0 > self.nextelement - ( self.nrowsinbuf) + 1: + tmp = self.coords[0:self.nextelement + 1] + else: + tmp = self.coords[self.nextelement - ( self.nrowsinbuf) + 1:self.nextelement + 1] + self.bufcoords = numpy.array(tmp, dtype="uint64") + recout = self.table._read_elements(self.bufcoords, self.iobuf) + self.bufcoords_data = PyArray_DATA(self.bufcoords) + self.nrowsread = self.nrowsread + self.nrowsinbuf + self._row = len(self.bufcoords) - 1 + else: + 
self._row = (self._row + self.step) % len(self.bufcoords) + + self._nrow = self.nextelement - self.step + self.nextelement = self.nextelement + self.step + # Return this value + return self + else: + # All the elements have been read for this mode + self._finish_riterator() + else: + self._finish_riterator() + + cdef __next__inkernel(self): + """The version of next() in case of in-kernel conditions""" + + cdef hsize_t recout, correct + cdef object numexpr_locals, colvar, col + self.nextelement = self._nrow + self.step + while self.nextelement < self.stop: + if self.nextelement >= self.nrowsread: + # Skip until there is interesting information + while self.nextelement >= self.nrowsread + self.nrowsinbuf: + self.nrowsread = self.nrowsread + self.nrowsinbuf + # Compute the end for this iteration + self.stopb = self.stop - self.nrowsread + if self.stopb > self.nrowsinbuf: + self.stopb = self.nrowsinbuf + self._row = self.startb - self.step + # Read a chunk + recout = self.table._read_records(self.nextelement, self.nrowsinbuf, + self.iobuf) + self.nrowsread = self.nrowsread + recout + self.indexchunk = -self.step + + # Evaluate the condition on this table fragment. + self.indexvalid = call_on_recarr( + self.condfunc, self.condargs, self.iobuf[:recout], **self.condkwargs) + self.index_valid_data = PyArray_BYTES(self.indexvalid) + + # Is there any interesting information in this buffer? + if not numpy.sometrue(self.indexvalid): + # No, so take the next one + if self.step >= self.nrowsinbuf: + self.nextelement = self.nextelement + self.step + else: + self.nextelement = self.nextelement + self.nrowsinbuf + # Correction for step size > 1 + if self.step > 1: + correct = (self.nextelement - self.start) % self.step + self.nextelement = self.nextelement - correct + continue + + self._row = self._row + self.step + self._nrow = self.nextelement + if self._row + self.step >= self.stopb: + # Compute the start row for the next buffer + self.startb = 0 + + self.nextelement = self._nrow + self.step + # Return only if this value is interesting + self.indexchunk = self.indexchunk + self.step + if self.index_valid_data[self.indexchunk]: + return self + else: + self._finish_riterator() + + cdef __next__general(self): + """The version of next() for the general cases""" + cdef int recout + if 0 < self.step: + self.nextelement = self._nrow + self.step + while self.nextelement < self.stop: + if self.nextelement >= self.nrowsread: + # Skip until there is interesting information + while self.nextelement >= self.nrowsread + self.nrowsinbuf: + self.nrowsread = self.nrowsread + self.nrowsinbuf + # Compute the end for this iteration + self.stopb = self.stop - self.nrowsread + if self.stopb > self.nrowsinbuf: + self.stopb = self.nrowsinbuf + self._row = self.startb - self.step + # Read a chunk + recout = self.table._read_records(self.nrowsread, self.nrowsinbuf, + self.iobuf) + self.nrowsread = self.nrowsread + recout + + self._row = self._row + self.step + self._nrow = self.nextelement + if self._row + self.step >= self.stopb: + # Compute the start row for the next buffer + self.startb = (self._row + self.step) % self.nrowsinbuf + + self.nextelement = self._nrow + self.step + # Return this value + return self + else: + self._finish_riterator() + elif 0 > self.step: + self.stopb = -1 + while self.nextelement - 1 > self.stop: + if self.nextelement < self.start - self.nrowsread + 1: + # Read a chunk + recout = self.table._read_records(self.nextelement - self.nrowsinbuf + 1, + self.nrowsinbuf, self.iobuf) + self.nrowsread = 
self.nrowsread + self.nrowsinbuf + self._row = self.nrowsinbuf - 1 + else: + self._row = (self._row + self.step) % self.nrowsinbuf + + self._nrow = self.nextelement - self.step + self.nextelement = self.nextelement + self.step + # Return this value + return self + else: + self._finish_riterator() + + cdef _finish_riterator(self): + """Clean-up things after iterator has been done""" + cdef ObjectCache seqcache + + self.rfieldscache = {} # empty rfields cache + self.wfieldscache = {} # empty wfields cache + # Make a copy of the last read row in the private record + # (this is useful for accessing the last row after an iterator loop) + if self._row >= 0: + self.wrec[:] = self.iobuf[self._row] + if self._write_to_seqcache: + seqcache = self.table._seqcache + # Guessing iterseq size: Each element in self.iterseq should take at least 8 bytes + seqcache.setitem_(self.seqcache_key, self.iterseq, len(self.iterseq) * 8) + self._riterator = 0 # out of iterator + self.iterseq = None # empty seqcache-related things + self.seqcache_key = None + if self._mod_nrows > 0: # Check if there is some modified row + self._flush_mod_rows() # Flush any possible modified row + self.modified_fields = set() # Empty the set of modified fields + raise StopIteration # end of iteration + + def _fill_col(self, result, start, stop, step, field): + """Read a field from a table on disk and put the result in result""" + + cdef hsize_t startr, istartb + cdef hsize_t istart, inrowsinbuf, inextelement + cdef long long stopr, istopb, i, j, inrowsread + cdef long long istop, istep + cdef object fields + + # We can't reuse existing buffers in this context + self._init_loop(start, stop, step, None, None) + istart, istop, istep = self.start, self.stop, self.step + inrowsinbuf, inextelement, inrowsread = self.nrowsinbuf, istart, istart + istartb, startr = self.startb, 0 + i = istart + if 0 < istep: + while i < istop: + if (inextelement >= inrowsread + inrowsinbuf): + inrowsread = inrowsread + inrowsinbuf + i = i + inrowsinbuf + continue + # Compute the end for this iteration + istopb = istop - inrowsread + if istopb > inrowsinbuf: + istopb = inrowsinbuf + stopr = startr + ((istopb - istartb - 1) // istep) + 1 + # Read a chunk + inrowsread = inrowsread + self.table._read_records(i, inrowsinbuf, + self.iobuf) + # Assign the correct part to result + fields = self.iobuf + if field: + fields = get_nested_field(fields, field) + result[startr:stopr] = fields[istartb:istopb:istep] + + # Compute some indexes for the next iteration + startr = stopr + j = istartb + ((istopb - istartb - 1) // istep) * istep + istartb = (j+istep) % inrowsinbuf + inextelement = inextelement + istep + i = i + inrowsinbuf + elif istep < 0: + inrowsinbuf = self.nrowsinbuf + #istartb = self.startb + istartb = self.nrowsinbuf - 1 + #istopb = self.stopb - 1 + istopb = -1 + startr = 0 + i = istart + inextelement = istart + inrowsread = 0 + while i-1 > istop: + #if (inextelement <= inrowsread + inrowsinbuf): + if (inextelement < i - inrowsinbuf): + inrowsread = inrowsread + inrowsinbuf + i = i - inrowsinbuf + continue + # Compute the end for this iteration + # (we know we are going backward so try to keep indices positive) + stopr = startr + (1 - istopb + istartb) // (-istep) + # Read a chunk + inrowsread = inrowsread + self.table._read_records(i - inrowsinbuf + 1, + inrowsinbuf, self.iobuf) + # Assign the correct part to result + fields = self.iobuf + if field: + fields = get_nested_field(fields, field) + if istopb >= 0: + result[startr:stopr] = 
fields[istartb:istopb:istep] + else: + result[startr:stopr] = fields[istartb::istep] + + # Compute some indexes for the next iteration + startr = stopr + istartb = (i - istartb)%inrowsinbuf + inextelement = inextelement + istep + i = i - inrowsinbuf + self._riterator = 0 # out of iterator + return + + + def append(self): + """Add a new row of data to the end of the dataset. + + Once you have filled the proper fields for the current + row, calling this method actually appends the new data to the + *output buffer* (which will eventually be + dumped to disk). If you have not set the value of a field, the + default value of the column will be used. + + .. warning:: + + After completion of the loop in which :meth:`Row.append` has + been called, it is always convenient to make a call to + :meth:`Table.flush` in order to avoid losing the last rows that + may still remain in internal buffers. + + Examples + -------- + + :: + + row = table.row + for i in xrange(nrows): + row['col1'] = i-1 + row['col2'] = 'a' + row['col3'] = -1.0 + row.append() + table.flush() + + """ + cdef ndarray iobuf, wrec, wreccpy + + if self.ro_filemode: + raise IOError("Attempt to write over a file opened in read-only mode") + + if not self.chunked: + raise HDF5ExtError("You cannot append rows to a non-chunked table.", + h5tb=False) + + if self._riterator: + raise NotImplementedError("You cannot append rows when in middle of a table iterator. If what you want is to update records, use Row.update() instead.") + + # Commit the private record into the write buffer + # self.iobuf[self._unsaved_nrows] = self.wrec + # The next is faster + iobuf = self.iobuf; wrec = self.wrec + memcpy(PyArray_BYTES(iobuf) + self._unsaved_nrows * self._stride, + PyArray_BYTES(wrec), self._rowsize) + # Restore the defaults for the private record + # self.wrec[:] = self.wreccpy + # The next is faster + wreccpy = self.wreccpy + memcpy(PyArray_BYTES(wrec), PyArray_BYTES(wreccpy), self._rowsize) + self._unsaved_nrows = self._unsaved_nrows + 1 + # When the buffer is full, flush it + if self._unsaved_nrows == self.nrowsinbuf: + self._flush_buffered_rows() + + def _flush_buffered_rows(self): + if self._unsaved_nrows > 0: + self.table._save_buffered_rows(self.iobuf, self._unsaved_nrows) + # Reset the buffer unsaved counter + self._unsaved_nrows = 0 + + + def _get_unsaved_nrows(self): + return self._unsaved_nrows + + + def update(self): + """Change the data of the current row in the dataset. + + This method allows you to modify values in a table when you are in the + middle of a table iterator like :meth:`Table.iterrows` or + :meth:`Table.where`. + + Once you have filled the proper fields for the current row, calling + this method actually changes data in the *output buffer* (which will + eventually be dumped to disk). If you have not set the value of a + field, its original value will be used. + + .. warning:: + + After completion of the loop in which :meth:`Row.update` has + been called, it is always convenient to make a call to + :meth:`Table.flush` in order to avoid losing changed rows that + may still remain in internal buffers. + + Examples + -------- + + :: + + for row in table.iterrows(step=10): + row['col1'] = row.nrow + row['col2'] = 'b' + row['col3'] = 0.0 + row.update() + table.flush() + + which modifies every tenth row in table. Or:: + + for row in table.where('col1 > 3'): + row['col1'] = row.nrow + row['col2'] = 'b' + row['col3'] = 0.0 + row.update() + table.flush() + + which just updates the rows with values bigger than 3 in the first + column. 
+ + """ + + cdef ndarray iobufcpy, iobuf + + if self.ro_filemode: + raise IOError("Attempt to write over a file opened in read-only mode") + + if not self._riterator: + raise NotImplementedError("You are only allowed to update rows through the Row.update() method if you are in the middle of a table iterator.") + + if self.mod_elements is None: + # Initialize an array for keeping the modified elements + # (just in case Row.update() would be used) + self.mod_elements = numpy.empty(shape=self.nrowsinbuf, dtype=SizeType) + # We need a different copy for self.iobuf here + self.iobufcpy = self.iobuf.copy() + + # Add this row to the list of elements to be modified + self.mod_elements[self._mod_nrows] = self._nrow + # Copy the current buffer row in input to the output buffer + # self.iobufcpy[self._mod_nrows] = self.iobuf[self._row] + # The next is faster + iobufcpy = self.iobufcpy; iobuf = self.iobuf + memcpy(PyArray_BYTES(iobufcpy) + self._mod_nrows * self._stride, + PyArray_BYTES(iobuf) + self._row * self._stride, self._rowsize) + # Increase the modified buffer count by one + self._mod_nrows = self._mod_nrows + 1 + # No point writing seqcache -- Table.flush will invalidate it + # since we no longer know whether this row will meet _where_condition + self._write_to_seqcache = 0 + # When the buffer is full, flush it + if self._mod_nrows == self.nrowsinbuf: + self._flush_mod_rows() + + def _flush_mod_rows(self): + """Flush any possible modified row using Row.update()""" + + table = self.table + # Save the records on disk + table._update_elements(self._mod_nrows, self.mod_elements, self.iobufcpy) + # Reset the counter of modified rows to 0 + self._mod_nrows = 0 + # Mark the modified fields' indexes as dirty. + table._mark_columns_as_dirty(self.modified_fields) + + + def __contains__(self, item): + """__contains__(item) + + A true value is returned if item is found in current row, false + otherwise. + + """ + + return item in self.fetch_all_fields() + + # This method is twice as faster than __getattr__ because there is + # not a lookup in the local dictionary + def __getitem__(self, key): + """__getitem__(key) + + Get the row field specified by the `key`. + + The key can be a string (the name of the field), an integer (the + position of the field) or a slice (the range of field positions). When + key is a slice, the returned value is a *tuple* containing the values + of the specified fields. + + Examples + -------- + + :: + + res = [row['var3'] for row in table.where('var2 < 20')] + + which selects the var3 field for all the rows that fulfil the + condition. Or:: + + res = [row[4] for row in table if row[1] < 20] + + which selects the field in the *4th* position for all the rows that + fulfil the condition. Or:: + + res = [row[:] for row in table if row['var2'] < 20] + + which selects the all the fields (in the form of a *tuple*) for all the + rows that fulfil the condition. Or:: + + res = [row[1::2] for row in table.iterrows(2, 3000, 3)] + + which selects all the fields in even positions (in the form of a + *tuple*) for all the rows in the slice [2:3000:3]. 
+ + """ + + cdef long offset + cdef ndarray field + cdef object row, fields, fieldscache + + if self._riterator: + # If in the middle of an iterator loop, the user probably wants to + # access the read buffer + fieldscache = self.rfieldscache; fields = self.rfields + offset = self._row + else: + # We are not in an iterator loop, so the user probably wants to access + # the write buffer + fieldscache = self.wfieldscache; fields = self.wfields + offset = 0 + + try: + # Check whether this object is in the cache dictionary + field = fieldscache[key] + except (KeyError, TypeError): + try: + # Try to get it from fields (str or int keys) + field = get_nested_field_cache(fields, key, fieldscache) + except TypeError: + # No luck yet. Still, the key can be a slice. + # Fetch the complete row and convert it into a tuple + if self._riterator: + row = self.iobuf[self._row].copy().item() + else: + row = self.wrec[0].copy().item() + # Try with __getitem__() + return row[key] + + if PyArray_NDIM(field) == 1: + # For an scalar it is not needed a copy (immutable object) + return PyArray_GETITEM(field, PyArray_BYTES(field) + offset * self._stride) + else: + # Do a copy of the array, so that it can be overwritten by the user + # without damaging the internal self.rfields buffer + return field[offset].copy() + + # This is slightly faster (around 3%) than __setattr__ + def __setitem__(self, object key, object value): + """__setitem__(key, value) + + Set the key row field to the specified value. + + Differently from its __getitem__() counterpart, in this case key can + only be a string (the name of the field). The changes done via + __setitem__() will not take effect on the data on disk until any of the + :meth:`Row.append` or :meth:`Row.update` methods are called. + + Examples + -------- + + :: + + for row in table.iterrows(step=10): + row['col1'] = row.nrow + row['col2'] = 'b' + row['col3'] = 0.0 + row.update() + table.flush() + + which modifies every tenth row in the table. + + """ + + cdef int ret + cdef long offset + cdef ndarray field + cdef object fields, fieldscache + + if self.ro_filemode: + raise IOError("attempt to write over a file opened in read-only mode") + + if self._riterator: + # If in the middle of an iterator loop, or *after*, the user + # probably wants to access the read buffer + fieldscache = self.rfieldscache; fields = self.rfields + offset = self._row + else: + # We are not in an iterator loop, so the user probably wants to access + # the write buffer + fieldscache = self.wfieldscache; fields = self.wfields + offset = 0 + + # Check validity of enumerated value. + if self.exist_enum_cols: + if key in self.colenums: + enum = self.colenums[key] + for cenval in numpy.asarray(value).flat: + enum(cenval) # raises ``ValueError`` on invalid values + + # Get the field to be modified + field = get_nested_field_cache(fields, key, fieldscache) + if key not in self.modified_fields: + self.modified_fields.add(key) + + # Finally, try to set it to the value + try: + # Optimization for scalar values. 
This can optimize the writes + # between a 10% and 100%, depending on the number of columns modified + if PyArray_NDIM(field) == 1: + ret = PyArray_SETITEM(field, PyArray_BYTES(field) + offset * self._stride, value) + if ret < 0: + PyErr_Clear() + raise TypeError + ##### End of optimization for scalar values + else: + field[offset] = value + except TypeError: + raise TypeError("invalid type (%s) for column ``%s``" % (type(value), + key)) + + def fetch_all_fields(self): + """Retrieve all the fields in the current row. + + Contrarily to row[:] (see :ref:`RowSpecialMethods`), this returns row + data as a NumPy void scalar. For instance:: + + [row.fetch_all_fields() for row in table.where('col1 < 3')] + + will select all the rows that fulfill the given condition + as a list of NumPy records. + + """ + + # We need to do a cast for recognizing negative row numbers! + if self._nrow < 0: + return ("Warning: Row iterator has not been initialized for table:\n" + " %s\n" + " You will normally want to use this method in iterator " + "contexts." % self.table) + + # Always return a copy of the row so that new data that is written + # in self.iobuf doesn't overwrite the original returned data. + return self.iobuf[self._row].copy() + + def __str__(self): + """Represent the record as an string""" + + # We need to do a cast for recognizing negative row numbers! + if self._nrow < 0: + return ("Warning: Row iterator has not been initialized for table:\n" + " %s\n" + " You will normally want to use this object in iterator " + "contexts." % self.table) + + tablepathname = self.table._v_pathname + classname = self.__class__.__name__ + return "%s.row (%s), pointing to row #%d" % (tablepathname, classname, + self._nrow) + + def __repr__(self): + """Represent the record as an string""" + + return str(self) + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/tests/Table2_1_lzo_nrv2e_shuffle.h5 b/tables/tests/Table2_1_lzo_nrv2e_shuffle.h5 new file mode 100644 index 0000000..8020100 Binary files /dev/null and b/tables/tests/Table2_1_lzo_nrv2e_shuffle.h5 differ diff --git a/tables/tests/Tables_lzo1.h5 b/tables/tests/Tables_lzo1.h5 new file mode 100644 index 0000000..4d928bc Binary files /dev/null and b/tables/tests/Tables_lzo1.h5 differ diff --git a/tables/tests/Tables_lzo1_shuffle.h5 b/tables/tests/Tables_lzo1_shuffle.h5 new file mode 100644 index 0000000..622518d Binary files /dev/null and b/tables/tests/Tables_lzo1_shuffle.h5 differ diff --git a/tables/tests/Tables_lzo2.h5 b/tables/tests/Tables_lzo2.h5 new file mode 100644 index 0000000..d85e4b1 Binary files /dev/null and b/tables/tests/Tables_lzo2.h5 differ diff --git a/tables/tests/Tables_lzo2_shuffle.h5 b/tables/tests/Tables_lzo2_shuffle.h5 new file mode 100644 index 0000000..6fc6b7d Binary files /dev/null and b/tables/tests/Tables_lzo2_shuffle.h5 differ diff --git a/tables/tests/__init__.py b/tables/tests/__init__.py new file mode 100644 index 0000000..9b0052c --- /dev/null +++ b/tables/tests/__init__.py @@ -0,0 +1,10 @@ +"""Unit tests for PyTables. + +This package contains some modules which provide a ``suite()`` function +(with no arguments) which returns a test suite for some PyTables +functionality. 
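+
+A minimal sketch of that convention (the test case shown here is
+hypothetical)::
+
+    import unittest
+
+    class SomeTestCase(unittest.TestCase):
+        def test_nothing(self):
+            self.assertTrue(True)
+
+    def suite():
+        theSuite = unittest.TestSuite()
+        theSuite.addTest(unittest.makeSuite(SomeTestCase))
+        return theSuite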
+ +""" + +from tables.tests.common import print_versions +from tables.tests.test_suite import test, suite diff --git a/tables/tests/array_mdatom.h5 b/tables/tests/array_mdatom.h5 new file mode 100644 index 0000000..64b0f89 Binary files /dev/null and b/tables/tests/array_mdatom.h5 differ diff --git a/tables/tests/attr-u16.h5 b/tables/tests/attr-u16.h5 new file mode 100644 index 0000000..1de849d Binary files /dev/null and b/tables/tests/attr-u16.h5 differ diff --git a/tables/tests/blosc_bigendian.h5 b/tables/tests/blosc_bigendian.h5 new file mode 100644 index 0000000..dce88b1 Binary files /dev/null and b/tables/tests/blosc_bigendian.h5 differ diff --git a/tables/tests/bug-idx.h5 b/tables/tests/bug-idx.h5 new file mode 100644 index 0000000..005577f Binary files /dev/null and b/tables/tests/bug-idx.h5 differ diff --git a/tables/tests/check_leaks.py b/tables/tests/check_leaks.py new file mode 100644 index 0000000..8ae5f46 --- /dev/null +++ b/tables/tests/check_leaks.py @@ -0,0 +1,337 @@ +from pathlib import Path +from time import perf_counter as clock + +import tables as tb + +tref = clock() +trel = tref + + +def show_mem(explain): + global tref, trel + + for line in Path("/proc/self/status").read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + + print("\nMemory usage: ******* %s *******" % explain) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") + print("WallClock time:", clock() - tref, end=' ') + print(" Delta time:", clock() - trel) + trel = clock() + + +def write_group(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + fileh.create_group(fileh.root, 'group' + str(child), + "child: %d" % child) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_group(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'group' + str(child)) + assert node is not None + # flavor = node._v_attrs.CLASS +# for child in fileh.walk_nodes(): +# pass + show_mem("After reading metadata. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_array(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + fileh.create_array(fileh.root, 'array' + str(child), + [1, 1], "child: %d" % child) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_array(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'array' + str(child)) + # flavor = node._v_attrs.FLAVOR + data = node[:] # Read data + assert data is not None + show_mem("After reading data. 
Iter %s" % i) + # for child in range(nchildren): + # node = fileh.get_node(fileh.root, 'array' + str(child)) + # flavor = node._v_attrs.FLAVOR + # # flavor = node._v_attrs + # for child in fileh.walk_nodes(): + # pass + # show_mem("After reading metadata. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_carray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + fileh.create_carray(fileh.root, 'array' + str(child), + tb.IntAtom(), (2,), "child: %d" % child) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_carray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'array' + str(child)) + # flavor = node._v_attrs.FLAVOR + data = node[:] # Read data + assert data is not None + # print("data-->", data) + show_mem("After reading data. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_earray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + ea = fileh.create_earray(fileh.root, 'array' + str(child), + tb.IntAtom(), shape=(0,), + title="child: %d" % child) + ea.append([1, 2, 3]) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_earray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'array' + str(child)) + # flavor = node._v_attrs.FLAVOR + data = node[:] # Read data + assert data is not None + # print("data-->", data) + show_mem("After reading data. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_vlarray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + vl = fileh.create_vlarray(fileh.root, 'array' + str(child), + tb.IntAtom(), "child: %d" % child) + vl.append([1, 2, 3]) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_vlarray(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'array' + str(child)) + # flavor = node._v_attrs.FLAVOR + data = node[:] # Read data + assert data is not None + # print("data-->", data) + show_mem("After reading data. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_table(filename, nchildren, niter): + + class Record(tb.IsDescription): + var1 = tb.IntCol(pos=1) + var2 = tb.StringCol(length=1, pos=2) + var3 = tb.FloatCol(pos=3) + + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + t = fileh.create_table(fileh.root, 'table' + str(child), + Record, "child: %d" % child) + t.append([[1, "2", 3.]]) + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_table(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'table' + str(child)) + # klass = node._v_attrs.CLASS + data = node[:] # Read data + assert data is not None + # print("data-->", data) + show_mem("After reading data. 
Iter %s" % i) + fileh.close() + show_mem("After close") + + +def write_xtable(filename, nchildren, niter): + + class Record(tb.IsDescription): + var1 = tb.IntCol(pos=1) + var2 = tb.StringCol(length=1, pos=2) + var3 = tb.FloatCol(pos=3) + + for i in range(niter): + fileh = tb.open_file(filename, mode="w") + for child in range(nchildren): + t = fileh.create_table(fileh.root, 'table' + str(child), + Record, "child: %d" % child) + t.append([[1, "2", 3.]]) + t.cols.var1.create_index() + show_mem("After creating. Iter %s" % i) + fileh.close() + show_mem("After close") + + +def read_xtable(filename, nchildren, niter): + for i in range(niter): + fileh = tb.open_file(filename, mode="r") + for child in range(nchildren): + node = fileh.get_node(fileh.root, 'table' + str(child)) + # klass = node._v_attrs.CLASS + # data = node[:] # Read data + # print("data-->", data) + show_mem("After reading data. Iter %s" % i) + fileh.close() + show_mem("After close") + del node + + +if __name__ == '__main__': + import pstats + import argparse + import profile as prof + + def _get_parser(): + parser = argparse.ArgumentParser( + description='Check for PyTables memory leaks.') + parser.add_argument('-v', '--verbose', action='store_true', + help='enable verbose mode') + parser.add_argument('-p', '--profile', action='store_true', + help='profile') + parser.add_argument('-a', '--array', action='store_true', + help='create/read arrays (default)') + parser.add_argument('-c', '--carray', action='store_true', + help='create/read carrays') + parser.add_argument('-e', '--earray', action='store_true', + help='create/read earrays') + parser.add_argument('-l', '--vlarray', action='store_true', + help='create/read vlarrays') + parser.add_argument('-t', '--table', action='store_true', + help='create/read tables') + parser.add_argument('-x', '--indexed-table', action='store_true', + dest='xtable', help='create/read indexed-tables') + parser.add_argument('-g', '--group', action='store_true', + help='create/read groups') + parser.add_argument('-r', '--read', action='store_true', + help='only read test') + parser.add_argument('-w', '--write', action='store_true', + help='only write test') + parser.add_argument('-n', '--nchildren', type=int, default=1000, + help='number of children (%(default)d is the ' + 'default)') + parser.add_argument('-i', '--niter', type=int, default=3, + help='number of iterations (default: %(default)d)') + + parser.add_argument('filename', help='HDF5 file name') + + return parser + + parser = _get_parser() + args = parser.parse_args() + + # set 'array' as default value if no ather option has been specified + for name in ('carray', 'earray', 'vlarray', 'table', 'xtable', 'group'): + if getattr(args, name): + break + else: + args.array = True + + filename = args.filename + nchildren = args.nchildren + niter = args.niter + + if args.array: + fwrite = 'write_array' + fread = 'read_array' + elif args.carray: + fwrite = 'write_carray' + fread = 'read_carray' + elif args.earray: + fwrite = 'write_earray' + fread = 'read_earray' + elif args.vlarray: + fwrite = 'write_vlarray' + fread = 'read_vlarray' + elif args.table: + fwrite = 'write_table' + fread = 'read_table' + elif args.xtable: + fwrite = 'write_xtable' + fread = 'read_xtable' + elif args.group: + fwrite = 'write_group' + fread = 'read_group' + + show_mem("Before open") + if args.write: + if args.profile: + prof.run(str(fwrite)+'(filename, nchildren, niter)', + 'write_file.prof') + stats = pstats.Stats('write_file.prof') + stats.strip_dirs() + 
stats.sort_stats('time', 'calls') + if args.verbose: + stats.print_stats() + else: + stats.print_stats(20) + else: + eval(fwrite+'(filename, nchildren, niter)') + if args.read: + if args.profile: + prof.run(fread+'(filename, nchildren, niter)', 'read_file.prof') + stats = pstats.Stats('read_file.prof') + stats.strip_dirs() + stats.sort_stats('time', 'calls') + if args.verbose: + print('profile -verbose') + stats.print_stats() + else: + stats.print_stats(20) + else: + eval(fread+'(filename, nchildren, niter)') diff --git a/tables/tests/common.py b/tables/tests/common.py new file mode 100644 index 0000000..7de2a86 --- /dev/null +++ b/tables/tests/common.py @@ -0,0 +1,364 @@ +"""Utilities for PyTables' test suites.""" + +import re +import sys +import locale +import platform +import tempfile +from pathlib import Path +from time import perf_counter as clock +from packaging.version import Version + +import unittest + +import numexpr as ne +import numpy as np + +import tables as tb +from tables.req_versions import min_blosc_bitshuffle_version + +hdf5_version = Version(tb.hdf5_version) +blosc_version = Version(tb.which_lib_version("blosc")[1]) + + +verbose = False +"""Show detailed output of the testing process.""" + +heavy = False +"""Run all tests even when they take long to complete.""" + +show_memory = False +"""Show the progress of memory consumption.""" + + +def parse_argv(argv): + global verbose, heavy + + if 'verbose' in argv: + verbose = True + argv.remove('verbose') + + if 'silent' in argv: # take care of old flag, just in case + verbose = False + argv.remove('silent') + + if '--heavy' in argv: + heavy = True + argv.remove('--heavy') + + return argv + + +zlib_avail = tb.which_lib_version("zlib") is not None +lzo_avail = tb.which_lib_version("lzo") is not None +bzip2_avail = tb.which_lib_version("bzip2") is not None +blosc_avail = tb.which_lib_version("blosc") is not None + + +def print_heavy(heavy): + if heavy: + print("""Performing the complete test suite!""") + else: + print("""\ +Performing only a light (yet comprehensive) subset of the test suite. +If you want a more complete test, try passing the --heavy flag to this script +(or set the 'heavy' parameter in case you are using tables.test() call). 
+The whole suite will take more than 4 hours to complete on a relatively +modern CPU and around 512 MB of main memory.""") + print('-=' * 38) + + +def print_versions(): + """Print all the versions of software that PyTables relies on.""" + + print('-=' * 38) + print("PyTables version: %s" % tb.__version__) + print("HDF5 version: %s" % tb.which_lib_version("hdf5")[1]) + print("NumPy version: %s" % np.__version__) + tinfo = tb.which_lib_version("zlib") + if ne.use_vml: + # Get only the main version number and strip out all the rest + vml_version = ne.get_vml_version() + vml_version = re.findall("[0-9.]+", vml_version)[0] + vml_avail = "using VML/MKL %s" % vml_version + else: + vml_avail = "not using Intel's VML/MKL" + print(f"Numexpr version: {ne.__version__} ({vml_avail})") + if tinfo is not None: + print(f"Zlib version: {tinfo[1]} (in Python interpreter)") + tinfo = tb.which_lib_version("lzo") + if tinfo is not None: + print("LZO version: {} ({})".format(tinfo[1], tinfo[2])) + tinfo = tb.which_lib_version("bzip2") + if tinfo is not None: + print("BZIP2 version: {} ({})".format(tinfo[1], tinfo[2])) + tinfo = tb.which_lib_version("blosc") + if tinfo is not None: + blosc_date = tinfo[2].split()[1] + print("Blosc version: {} ({})".format(tinfo[1], blosc_date)) + blosc_cinfo = tb.blosc_get_complib_info() + blosc_cinfo = [ + "{} ({})".format(k, v[1]) for k, v in sorted(blosc_cinfo.items()) + ] + print("Blosc compressors: %s" % ', '.join(blosc_cinfo)) + blosc_finfo = ['shuffle'] + if Version(tinfo[1]) >= tb.req_versions.min_blosc_bitshuffle_version: + blosc_finfo.append('bitshuffle') + print("Blosc filters: %s" % ', '.join(blosc_finfo)) + try: + from Cython import __version__ as cython_version + print('Cython version: %s' % cython_version) + except Exception: + pass + print('Python version: %s' % sys.version) + print('Platform: %s' % platform.platform()) + # if os.name == 'posix': + # (sysname, nodename, release, version, machine) = os.uname() + # print('Platform: %s-%s' % (sys.platform, machine)) + print('Byte-ordering: %s' % sys.byteorder) + print('Detected cores: %s' % tb.utils.detect_number_of_cores()) + print('Default encoding: %s' % sys.getdefaultencoding()) + print('Default FS encoding: %s' % sys.getfilesystemencoding()) + print('Default locale: (%s, %s)' % locale.getdefaultlocale()) + print('-=' * 38) + + # This should improve readability whan tests are run by CI tools + sys.stdout.flush() + + +def test_filename(filename): + from pkg_resources import resource_filename + return resource_filename('tables.tests', filename) + + +def verbosePrint(string, nonl=False): + """Print out the `string` if verbose output is enabled.""" + if not verbose: + return + if nonl: + print(string, end=' ') + else: + print(string) + + +def allequal(a, b, flavor="numpy"): + """Checks if two numerical objects are equal.""" + + # print("a-->", repr(a)) + # print("b-->", repr(b)) + if not hasattr(b, "shape"): + # Scalar case + return a == b + + if ((not hasattr(a, "shape") or a.shape == ()) and + (not hasattr(b, "shape") or b.shape == ())): + return a == b + + if a.shape != b.shape: + if verbose: + print("Shape is not equal:", a.shape, "!=", b.shape) + return 0 + + # Way to check the type equality without byteorder considerations + if hasattr(b, "dtype") and a.dtype.str[1:] != b.dtype.str[1:]: + if verbose: + print("dtype is not equal:", a.dtype, "!=", b.dtype) + return 0 + + # Rank-0 case + if len(a.shape) == 0: + if a[()] == b[()]: + return 1 + else: + if verbose: + print("Shape is not equal:", a.shape, "!=", 
b.shape) + return 0 + + # null arrays + if a.size == 0: # len(a) is not correct for generic shapes + if b.size == 0: + return 1 + else: + if verbose: + print("length is not equal") + print("len(a.data) ==>", len(a.data)) + print("len(b.data) ==>", len(b.data)) + return 0 + + # Multidimensional case + result = (a == b) + result = np.all(result) + if not result and verbose: + print("Some of the elements in arrays are not equal") + + return result + + +def areArraysEqual(arr1, arr2): + """Are both `arr1` and `arr2` equal arrays? + + Arguments can be regular NumPy arrays, chararray arrays or + structured arrays (including structured record arrays). They are + checked for type and value equality. + + """ + + t1 = type(arr1) + t2 = type(arr2) + + if not ((hasattr(arr1, 'dtype') and arr1.dtype == arr2.dtype) or + issubclass(t1, t2) or issubclass(t2, t1)): + return False + + return np.all(arr1 == arr2) + + +class PyTablesTestCase(unittest.TestCase): + def tearDown(self): + super().tearDown() + for key in self.__dict__: + if self.__dict__[key].__class__.__name__ != 'instancemethod': + self.__dict__[key] = None + + def _getName(self): + """Get the name of this test case.""" + return self.id().split('.')[-2] + + def _getMethodName(self): + """Get the name of the method currently running in the test case.""" + return self.id().split('.')[-1] + + def _verboseHeader(self): + """Print a nice header for the current test method if verbose.""" + + if verbose: + name = self._getName() + methodName = self._getMethodName() + + title = f"Running {name}.{methodName}" + print('{}\n{}'.format(title, '-' * len(title))) + + # COMPATIBILITY: assertWarns is new in Python 3.2 + if not hasattr(unittest.TestCase, 'assertWarns'): + def assertWarns(self, expected_warning, callable_obj=None, + *args, **kwargs): + context = _AssertWarnsContext(expected_warning, self, callable_obj) + return context.handle('assertWarns', callable_obj, args, kwargs) + + def _checkEqualityGroup(self, node1, node2, hardlink=False): + if verbose: + print("Group 1:", node1) + print("Group 2:", node2) + if hardlink: + self.assertTrue( + node1._v_pathname != node2._v_pathname, + "node1 and node2 have the same pathnames.") + else: + self.assertTrue( + node1._v_pathname == node2._v_pathname, + "node1 and node2 does not have the same pathnames.") + self.assertTrue( + node1._v_children == node2._v_children, + "node1 and node2 does not have the same children.") + + def _checkEqualityLeaf(self, node1, node2, hardlink=False): + if verbose: + print("Leaf 1:", node1) + print("Leaf 2:", node2) + if hardlink: + self.assertTrue( + node1._v_pathname != node2._v_pathname, + "node1 and node2 have the same pathnames.") + else: + self.assertTrue( + node1._v_pathname == node2._v_pathname, + "node1 and node2 does not have the same pathnames.") + self.assertTrue( + areArraysEqual(node1[:], node2[:]), + "node1 and node2 does not have the same values.") + + +class TestFileMixin: + h5fname = None + open_kwargs = {} + + def setUp(self): + super().setUp() + self.h5file = tb.open_file( + self.h5fname, title=self._getName(), **self.open_kwargs) + + def tearDown(self): + """Close ``h5file``.""" + + self.h5file.close() + super().tearDown() + + +class TempFileMixin: + open_mode = 'w' + open_kwargs = {} + + def _getTempFileName(self): + return tempfile.mktemp(prefix=self._getName(), suffix='.h5') + + def setUp(self): + """Set ``h5file`` and ``h5fname`` instance attributes. + + * ``h5fname``: the name of the temporary HDF5 file. 
+ * ``h5file``: the writable, empty, temporary HDF5 file. + + """ + + super().setUp() + self.h5fname = self._getTempFileName() + self.h5file = tb.open_file( + self.h5fname, self.open_mode, title=self._getName(), + **self.open_kwargs) + + def tearDown(self): + """Close ``h5file`` and remove ``h5fname``.""" + + self.h5file.close() + self.h5file = None + Path(self.h5fname).unlink() # comment this for debug only + super().tearDown() + + def _reopen(self, mode='r', **kwargs): + """Reopen ``h5file`` in the specified ``mode``. + + Returns a true or false value depending on whether the file was + reopenend or not. If not, nothing is changed. + + """ + + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, mode, **kwargs) + return True + + +class ShowMemTime(PyTablesTestCase): + tref = clock() + """Test for showing memory and time consumption.""" + + def test00(self): + """Showing memory and time consumption.""" + + # Obtain memory info (only for Linux 2.6.x) + for line in Path("/proc/self/status").read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + print("\nWallClock time:", clock() - self.tref) + print("Memory usage: ******* %s *******" % self._getName()) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") diff --git a/tables/tests/create-nested-type.c b/tables/tests/create-nested-type.c new file mode 100644 index 0000000..870d37b --- /dev/null +++ b/tables/tests/create-nested-type.c @@ -0,0 +1,123 @@ +// This program creates nested types with gaps for testing purposes. +// F. 
Alted 2008-06-27 + +#include "hdf5.h" +#include + + +hid_t +create_nested_type(void) { + hid_t tid, tid2, tid3; + size_t offset, offset2; + + offset = 1; offset2 = 2; + // Create a coumpound type large enough (>= 20) + tid = H5Tcreate(H5T_COMPOUND, 21); + // Insert an atomic type + tid2 = H5Tcopy(H5T_NATIVE_FLOAT); + H5Tinsert(tid, "float", offset, tid2); + H5Tclose(tid2); + offset += 4 + 2; // add two to the offset so as to create gaps + // Insert a nested compound + tid2 = H5Tcreate(H5T_COMPOUND, 12); + tid3 = H5Tcopy(H5T_NATIVE_CHAR); + H5Tinsert(tid2, "char", offset2, tid3); + H5Tclose(tid3); + offset2 += 2; // add one space (for introducing gaps) + tid3 = H5Tcopy(H5T_NATIVE_DOUBLE); + H5Tinsert(tid2, "double", offset2, tid3); + H5Tclose(tid3); + offset2 += 5; // add one space (for introducing gaps) + H5Tinsert(tid, "compound", offset, tid2); + H5Tclose(tid2); + offset += 12 + 1; + return(tid); +} + +size_t +getNestedSizeType(hid_t type_id) { + hid_t member_type_id; + H5T_class_t class_id; + hsize_t i, nfields; + size_t itemsize, offset; + + nfields = H5Tget_nmembers(type_id); + offset = 0; + // Iterate thru the members + for (i=0; i < nfields; i++) { + // Get the member type + member_type_id = H5Tget_member_type(type_id, i); + // Get the HDF5 class + class_id = H5Tget_class(member_type_id); + if (class_id == H5T_COMPOUND) { + // Get the member size for compound type + itemsize = getNestedSizeType(member_type_id); + } + else { + // Get the atomic member size + itemsize = H5Tget_size(member_type_id); + } + // Update the offset + offset = offset + itemsize; + } + return(offset); +} + + +int +main(int argc, char **argv) +{ + char file_name[256], dset_name[256]; + hid_t file_id, dataset_id, space_id, plist_id, type_id; + hsize_t dims[1], dims_chunk[1]; + hsize_t maxdims[1] = { H5S_UNLIMITED }; + size_t disk_type_size, computed_type_size, packed_type_size; + + if (argc < 3) { + printf("Pass the name of the file and dataset to check as arguments\n"); + return(0); + } + + strcpy(file_name, argv[1]); + strcpy(dset_name, argv[2]); + + dims[0] = 20; // Create 20 records + dims_chunk[0] = 10; + + // Create a new file + file_id = H5Fcreate(file_name, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + // Create a simple data space with unlimited size + space_id = H5Screate_simple(1, dims, maxdims); + // Modify dataset creation properties, i.e. enable chunking + plist_id = H5Pcreate (H5P_DATASET_CREATE); + H5Pset_chunk(plist_id, 1, dims_chunk); + // Get the nested type + type_id = create_nested_type(); + // Create the dataset + dataset_id = H5Dcreate(file_id, dset_name, type_id, space_id, + H5P_DEFAULT, plist_id, H5P_DEFAULT); + // Free resources + H5Sclose(space_id); + H5Pclose(plist_id); + H5Dclose(dataset_id); + H5Fclose(file_id); + + // Compute type sizes for native and packed + disk_type_size = H5Tget_size(type_id); + computed_type_size = getNestedSizeType(type_id); + H5Tpack(type_id); // pack type + packed_type_size = H5Tget_size(type_id); + printf("Disk type size: %d\n", disk_type_size); + printf("Packed type size: %d (should be %d)\n", + packed_type_size, computed_type_size); + + H5Tclose(type_id); + + return(1); +} + + + + + + diff --git a/tables/tests/create_backcompat_indexes.py b/tables/tests/create_backcompat_indexes.py new file mode 100644 index 0000000..11fc114 --- /dev/null +++ b/tables/tests/create_backcompat_indexes.py @@ -0,0 +1,41 @@ +# Script for creating different kind of indexes in a small space as possible. +# This is intended for testing purposes. 
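+# The code below writes ``indexes_2_1.h5`` containing two small tables: the
+# columns of the first one get one index of each optimization kind
+# (ultralight, light, medium and full), while the second is a plain copy, so
+# that the test suite can exercise index compatibility with files produced by
+# older PyTables releases.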
+ +import tables as tb + + +class Descr(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, shape=(), dflt='', pos=0) + var2 = tb.BoolCol(shape=(), dflt=False, pos=1) + var3 = tb.Int32Col(shape=(), dflt=0, pos=2) + var4 = tb.Float64Col(shape=(), dflt=0.0, pos=3) + + +# Parameters for the table and index creation +small_chunkshape = (2,) +small_blocksizes = (64, 32, 16, 8) +nrows = 43 + +# Create the new file +h5fname = 'indexes_2_1.h5' +h5file = tb.open_file(h5fname, 'w') +t1 = h5file.create_table(h5file.root, 'table1', Descr) +row = t1.row +for i in range(nrows): + row['var1'] = i + row['var2'] = i + row['var3'] = i + row['var4'] = i + row.append() +t1.flush() + +# Do a copy of table1 +t1.copy(h5file.root, 'table2') + +# Create indexes of all kinds +t1.cols.var1.create_index(0, 'ultralight', _blocksizes=small_blocksizes) +t1.cols.var2.create_index(3, 'light', _blocksizes=small_blocksizes) +t1.cols.var3.create_index(6, 'medium', _blocksizes=small_blocksizes) +t1.cols.var4.create_index(9, 'full', _blocksizes=small_blocksizes) + +h5file.close() diff --git a/tables/tests/elink.h5 b/tables/tests/elink.h5 new file mode 100644 index 0000000..7fdf3ad Binary files /dev/null and b/tables/tests/elink.h5 differ diff --git a/tables/tests/elink2.h5 b/tables/tests/elink2.h5 new file mode 100644 index 0000000..2b5a394 Binary files /dev/null and b/tables/tests/elink2.h5 differ diff --git a/tables/tests/ex-noattr.h5 b/tables/tests/ex-noattr.h5 new file mode 100644 index 0000000..c839038 Binary files /dev/null and b/tables/tests/ex-noattr.h5 differ diff --git a/tables/tests/flavored_vlarrays-format1.6.h5 b/tables/tests/flavored_vlarrays-format1.6.h5 new file mode 100644 index 0000000..5592f8c Binary files /dev/null and b/tables/tests/flavored_vlarrays-format1.6.h5 differ diff --git a/tables/tests/float.h5 b/tables/tests/float.h5 new file mode 100644 index 0000000..6555e3d Binary files /dev/null and b/tables/tests/float.h5 differ diff --git a/tables/tests/idx-std-1.x.h5 b/tables/tests/idx-std-1.x.h5 new file mode 100644 index 0000000..3b28020 Binary files /dev/null and b/tables/tests/idx-std-1.x.h5 differ diff --git a/tables/tests/indexes_2_0.h5 b/tables/tests/indexes_2_0.h5 new file mode 100644 index 0000000..54169b2 Binary files /dev/null and b/tables/tests/indexes_2_0.h5 differ diff --git a/tables/tests/indexes_2_1.h5 b/tables/tests/indexes_2_1.h5 new file mode 100644 index 0000000..88e2e79 Binary files /dev/null and b/tables/tests/indexes_2_1.h5 differ diff --git a/tables/tests/issue_368.h5 b/tables/tests/issue_368.h5 new file mode 100644 index 0000000..4cb0e9a Binary files /dev/null and b/tables/tests/issue_368.h5 differ diff --git a/tables/tests/issue_560.h5 b/tables/tests/issue_560.h5 new file mode 100644 index 0000000..9b42dfd Binary files /dev/null and b/tables/tests/issue_560.h5 differ diff --git a/tables/tests/itemsize.h5 b/tables/tests/itemsize.h5 new file mode 100644 index 0000000..64ac7ed Binary files /dev/null and b/tables/tests/itemsize.h5 differ diff --git a/tables/tests/matlab_file.mat b/tables/tests/matlab_file.mat new file mode 100644 index 0000000..810eabe Binary files /dev/null and b/tables/tests/matlab_file.mat differ diff --git a/tables/tests/nested-type-with-gaps.h5 b/tables/tests/nested-type-with-gaps.h5 new file mode 100644 index 0000000..de4217f Binary files /dev/null and b/tables/tests/nested-type-with-gaps.h5 differ diff --git a/tables/tests/non-chunked-table.h5 b/tables/tests/non-chunked-table.h5 new file mode 100644 index 0000000..a7df4ea Binary files /dev/null and 
b/tables/tests/non-chunked-table.h5 differ diff --git a/tables/tests/oldflavor_numeric.h5 b/tables/tests/oldflavor_numeric.h5 new file mode 100644 index 0000000..5253468 Binary files /dev/null and b/tables/tests/oldflavor_numeric.h5 differ diff --git a/tables/tests/out_of_order_types.h5 b/tables/tests/out_of_order_types.h5 new file mode 100644 index 0000000..92f4e17 Binary files /dev/null and b/tables/tests/out_of_order_types.h5 differ diff --git a/tables/tests/python2.h5 b/tables/tests/python2.h5 new file mode 100644 index 0000000..10630ff Binary files /dev/null and b/tables/tests/python2.h5 differ diff --git a/tables/tests/python3.h5 b/tables/tests/python3.h5 new file mode 100644 index 0000000..28eccfd Binary files /dev/null and b/tables/tests/python3.h5 differ diff --git a/tables/tests/scalar.h5 b/tables/tests/scalar.h5 new file mode 100644 index 0000000..a6a1012 Binary files /dev/null and b/tables/tests/scalar.h5 differ diff --git a/tables/tests/slink.h5 b/tables/tests/slink.h5 new file mode 100644 index 0000000..b95b7af Binary files /dev/null and b/tables/tests/slink.h5 differ diff --git a/tables/tests/smpl_SDSextendible.h5 b/tables/tests/smpl_SDSextendible.h5 new file mode 100644 index 0000000..7a7bcc2 Binary files /dev/null and b/tables/tests/smpl_SDSextendible.h5 differ diff --git a/tables/tests/smpl_compound_chunked.h5 b/tables/tests/smpl_compound_chunked.h5 new file mode 100644 index 0000000..1cd1d33 Binary files /dev/null and b/tables/tests/smpl_compound_chunked.h5 differ diff --git a/tables/tests/smpl_enum.h5 b/tables/tests/smpl_enum.h5 new file mode 100644 index 0000000..8bc6050 Binary files /dev/null and b/tables/tests/smpl_enum.h5 differ diff --git a/tables/tests/smpl_f64be.h5 b/tables/tests/smpl_f64be.h5 new file mode 100644 index 0000000..5ce30e9 Binary files /dev/null and b/tables/tests/smpl_f64be.h5 differ diff --git a/tables/tests/smpl_f64le.h5 b/tables/tests/smpl_f64le.h5 new file mode 100644 index 0000000..c54b96b Binary files /dev/null and b/tables/tests/smpl_f64le.h5 differ diff --git a/tables/tests/smpl_i32be.h5 b/tables/tests/smpl_i32be.h5 new file mode 100644 index 0000000..c79980a Binary files /dev/null and b/tables/tests/smpl_i32be.h5 differ diff --git a/tables/tests/smpl_i32le.h5 b/tables/tests/smpl_i32le.h5 new file mode 100644 index 0000000..5f24e12 Binary files /dev/null and b/tables/tests/smpl_i32le.h5 differ diff --git a/tables/tests/smpl_i64be.h5 b/tables/tests/smpl_i64be.h5 new file mode 100644 index 0000000..97f518c Binary files /dev/null and b/tables/tests/smpl_i64be.h5 differ diff --git a/tables/tests/smpl_i64le.h5 b/tables/tests/smpl_i64le.h5 new file mode 100644 index 0000000..c867416 Binary files /dev/null and b/tables/tests/smpl_i64le.h5 differ diff --git a/tables/tests/smpl_unsupptype.h5 b/tables/tests/smpl_unsupptype.h5 new file mode 100644 index 0000000..0ed3901 Binary files /dev/null and b/tables/tests/smpl_unsupptype.h5 differ diff --git a/tables/tests/test_all.py b/tables/tests/test_all.py new file mode 100644 index 0000000..7245d1c --- /dev/null +++ b/tables/tests/test_all.py @@ -0,0 +1,49 @@ +"""Run all test cases.""" + +import sys + +import numpy as np +from packaging.version import Version + +import tables as tb +from tables.tests import common +from tables.tests.test_suite import suite, test + + +def get_tuple_version(hexversion): + """Get a tuple from a compact version in hex.""" + + h = hexversion + return(h & 0xff0000) >> 16, (h & 0xff00) >> 8, h & 0xff + + +if __name__ == '__main__': + + common.parse_argv(sys.argv) + + hdf5_version 
= get_tuple_version(tb.which_lib_version("hdf5")[0]) + hdf5_version_str = "%s.%s.%s" % hdf5_version + if Version(hdf5_version_str) < tb.req_versions.min_hdf5_version: + print(f"*Warning*: HDF5 version is lower than recommended: " + f"{hdf5_version} < {tb.req_versions.min_hdf5_version}") + + if Version(np.__version__) < tb.req_versions.min_numpy_version: + print(f"*Warning*: NumPy version is lower than recommended: " + f"{np.__version__} < {tb.req_versions.min_numpy_version}") + + # Handle some global flags (i.e. only useful for test_all.py) + only_versions = 0 + args = sys.argv[:] + for arg in args: + # Remove 'show-versions' for PyTables 2.3 or higher + if arg in ['--print-versions', '--show-versions']: + only_versions = True + sys.argv.remove(arg) + elif arg == '--show-memory': + common.show_memory = True + sys.argv.remove(arg) + + common.print_versions() + if not only_versions: + common.print_heavy(common.heavy) + common.unittest.main(defaultTest='tb.tests.suite') diff --git a/tables/tests/test_array.py b/tables/tests/test_array.py new file mode 100644 index 0000000..00e827b --- /dev/null +++ b/tables/tests/test_array.py @@ -0,0 +1,2730 @@ +import sys +import tempfile +from pathlib import Path + +import numpy as np +import tables as tb +from tables.tests import common + + +# warnings.resetwarnings() + + +class BasicTestCase(common.PyTablesTestCase): + """Basic test for all the supported typecodes present in numpy. + + All of them are included on pytables. + + """ + endiancheck = False + + def write_read(self, testarray): + a = testarray + if common.verbose: + print('\n', '-=' * 30) + print("Running test for array with type '%s'" % a.dtype.type, + end=' ') + print("for class check:", self.title) + + # Create an instance of HDF5 file + filename = tempfile.mktemp(".h5") + try: + with tb.open_file(filename, mode="w") as fileh: + root = fileh.root + + # Create the array under root and name 'somearray' + if self.endiancheck and a.dtype.kind != "S": + b = a.byteswap() + b.dtype = a.dtype.newbyteorder() + a = b + + fileh.create_array(root, 'somearray', a, "Some array") + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + root = fileh.root + + # Read the saved array + b = root.somearray.read() + + # Compare them. They should be equal. 
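+                # In verbose mode, dump shape/itemsize/dtype diagnostics first,
+                # so that a mismatch can be inspected before the strict
+                # assertions below make the test fail.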
+ if common.verbose and not common.allequal(a, b): + print("Write and read arrays differ!") + # print("Array written:", a) + print("Array written shape:", a.shape) + print("Array written itemsize:", a.itemsize) + print("Array written type:", a.dtype.type) + # print("Array read:", b) + print("Array read shape:", b.shape) + print("Array read itemsize:", b.itemsize) + print("Array read type:", b.dtype.type) + if a.dtype.kind != "S": + print("Array written byteorder:", a.dtype.byteorder) + print("Array read byteorder:", b.dtype.byteorder) + + # Check strictly the array equality + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.shape, root.somearray.shape) + if a.dtype.kind == "S": + self.assertEqual(root.somearray.atom.type, "string") + else: + self.assertEqual(a.dtype.type, b.dtype.type) + self.assertEqual(a.dtype.type, + root.somearray.atom.dtype.type) + abo = tb.utils.byteorders[a.dtype.byteorder] + bbo = tb.utils.byteorders[b.dtype.byteorder] + if abo != "irrelevant": + self.assertEqual(abo, root.somearray.byteorder) + self.assertEqual(bbo, sys.byteorder) + if self.endiancheck: + self.assertNotEqual(bbo, abo) + + obj = root.somearray + self.assertEqual(obj.flavor, 'numpy') + self.assertEqual(obj.shape, a.shape) + self.assertEqual(obj.ndim, a.ndim) + self.assertEqual(obj.chunkshape, None) + if a.shape: + nrows = a.shape[0] + else: + # scalar + nrows = 1 + + self.assertEqual(obj.nrows, nrows) + + self.assertTrue(common.allequal(a, b)) + finally: + # Then, delete the file + Path(filename).unlink() + + def write_read_out_arg(self, testarray): + a = testarray + + if common.verbose: + print('\n', '-=' * 30) + print("Running test for array with type '%s'" % a.dtype.type, + end=' ') + print("for class check:", self.title) + + # Create an instance of HDF5 file + filename = tempfile.mktemp(".h5") + try: + with tb.open_file(filename, mode="w") as fileh: + root = fileh.root + + # Create the array under root and name 'somearray' + if self.endiancheck and a.dtype.kind != "S": + b = a.byteswap() + b.dtype = a.dtype.newbyteorder() + a = b + + fileh.create_array(root, 'somearray', a, "Some array") + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + root = fileh.root + + # Read the saved array + b = np.empty_like(a, dtype=a.dtype) + root.somearray.read(out=b) + + # Check strictly the array equality + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.shape, root.somearray.shape) + if a.dtype.kind == "S": + self.assertEqual(root.somearray.atom.type, "string") + else: + self.assertEqual(a.dtype.type, b.dtype.type) + self.assertEqual(a.dtype.type, + root.somearray.atom.dtype.type) + abo = tb.utils.byteorders[a.dtype.byteorder] + bbo = tb.utils.byteorders[b.dtype.byteorder] + if abo != "irrelevant": + self.assertEqual(abo, root.somearray.byteorder) + self.assertEqual(abo, bbo) + if self.endiancheck: + self.assertNotEqual(bbo, sys.byteorder) + + self.assertTrue(common.allequal(a, b)) + finally: + # Then, delete the file + Path(filename).unlink() + + def write_read_atom_shape_args(self, testarray): + a = testarray + atom = tb.Atom.from_dtype(a.dtype) + shape = a.shape + byteorder = None + + if common.verbose: + print('\n', '-=' * 30) + print("Running test for array with type '%s'" % a.dtype.type, + end=' ') + print("for class check:", self.title) + + # Create an instance of HDF5 file + filename = tempfile.mktemp(".h5") + try: + with tb.open_file(filename, mode="w") as fileh: + root = fileh.root + + # Create the array under root and name 'somearray' + if 
self.endiancheck and a.dtype.kind != "S": + b = a.byteswap() + b.dtype = a.dtype.newbyteorder() + if b.dtype.byteorder in ('>', '<'): + byteorder = tb.utils.byteorders[b.dtype.byteorder] + a = b + + ptarr = fileh.create_array(root, 'somearray', + atom=atom, shape=shape, + title="Some array", + # specify the byteorder explicitly + # since there is no way to deduce + # it in this case + byteorder=byteorder) + self.assertEqual(shape, ptarr.shape) + self.assertEqual(atom, ptarr.atom) + ptarr[...] = a + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + root = fileh.root + + # Read the saved array + b = root.somearray.read() + + # Compare them. They should be equal. + if common.verbose and not common.allequal(a, b): + print("Write and read arrays differ!") + # print("Array written:", a) + print("Array written shape:", a.shape) + print("Array written itemsize:", a.itemsize) + print("Array written type:", a.dtype.type) + # print("Array read:", b) + print("Array read shape:", b.shape) + print("Array read itemsize:", b.itemsize) + print("Array read type:", b.dtype.type) + if a.dtype.kind != "S": + print("Array written byteorder:", a.dtype.byteorder) + print("Array read byteorder:", b.dtype.byteorder) + + # Check strictly the array equality + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.shape, root.somearray.shape) + if a.dtype.kind == "S": + self.assertEqual(root.somearray.atom.type, "string") + else: + self.assertEqual(a.dtype.type, b.dtype.type) + self.assertEqual(a.dtype.type, + root.somearray.atom.dtype.type) + abo = tb.utils.byteorders[a.dtype.byteorder] + bbo = tb.utils.byteorders[b.dtype.byteorder] + if abo != "irrelevant": + self.assertEqual(abo, root.somearray.byteorder) + self.assertEqual(bbo, sys.byteorder) + if self.endiancheck: + self.assertNotEqual(bbo, abo) + + obj = root.somearray + self.assertEqual(obj.flavor, 'numpy') + self.assertEqual(obj.shape, a.shape) + self.assertEqual(obj.ndim, a.ndim) + self.assertEqual(obj.chunkshape, None) + if a.shape: + nrows = a.shape[0] + else: + # scalar + nrows = 1 + + self.assertEqual(obj.nrows, nrows) + + self.assertTrue(common.allequal(a, b)) + finally: + # Then, delete the file + Path(filename).unlink() + + def setup00_char(self): + """Data integrity during recovery (character objects)""" + + if not isinstance(self.tupleChar, np.ndarray): + a = np.array(self.tupleChar, dtype="S") + else: + a = self.tupleChar + + return a + + def test00_char(self): + a = self.setup00_char() + self.write_read(a) + + def test00_char_out_arg(self): + a = self.setup00_char() + self.write_read_out_arg(a) + + def test00_char_atom_shape_args(self): + a = self.setup00_char() + self.write_read_atom_shape_args(a) + + def test00b_char(self): + """Data integrity during recovery (string objects)""" + + a = self.tupleChar + + filename = tempfile.mktemp(".h5") + try: + # Create an instance of HDF5 file + with tb.open_file(filename, mode="w") as fileh: + fileh.create_array(fileh.root, 'somearray', a, "Some array") + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + # Read the saved array + b = fileh.root.somearray.read() + if isinstance(a, bytes): + self.assertEqual(type(b), bytes) + self.assertEqual(a, b) + else: + # If a is not a python string, then it should be a list + # or ndarray + self.assertIn(type(b), [list, np.ndarray]) + finally: + # Then, delete the file + Path(filename).unlink() + + def test00b_char_out_arg(self): + """Data integrity during recovery (string objects)""" + + a 
= self.tupleChar + + filename = tempfile.mktemp(".h5") + try: + # Create an instance of HDF5 file + with tb.open_file(filename, mode="w") as fileh: + fileh.create_array(fileh.root, 'somearray', a, "Some array") + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + # Read the saved array + b = np.empty_like(a) + if fileh.root.somearray.flavor != 'numpy': + self.assertRaises(TypeError, + lambda: fileh.root.somearray.read(out=b)) + else: + fileh.root.somearray.read(out=b) + self.assertIsInstance(b, np.ndarray) + finally: + # Then, delete the file + Path(filename).unlink() + + def test00b_char_atom_shape_args(self): + """Data integrity during recovery (string objects)""" + + a = self.tupleChar + + filename = tempfile.mktemp(".h5") + try: + # Create an instance of HDF5 file + with tb.open_file(filename, mode="w") as fileh: + nparr = np.asarray(a) + atom = tb.Atom.from_dtype(nparr.dtype) + shape = nparr.shape + if nparr.dtype.byteorder in ('>', '<'): + byteorder = tb.utils.byteorders[nparr.dtype.byteorder] + else: + byteorder = None + + ptarr = fileh.create_array(fileh.root, 'somearray', + atom=atom, shape=shape, + byteorder=byteorder, + title="Some array") + self.assertEqual(shape, ptarr.shape) + self.assertEqual(atom, ptarr.atom) + ptarr[...] = a + + # Re-open the file in read-only mode + with tb.open_file(filename, mode="r") as fileh: + # Read the saved array + b = np.empty_like(a) + if fileh.root.somearray.flavor != 'numpy': + self.assertRaises(TypeError, + lambda: fileh.root.somearray.read(out=b)) + else: + fileh.root.somearray.read(out=b) + self.assertIsInstance(b, np.ndarray) + finally: + # Then, delete the file + Path(filename).unlink() + + def setup01_char_nc(self): + """Data integrity during recovery (non-contiguous character objects)""" + + if not isinstance(self.tupleChar, np.ndarray): + a = np.array(self.tupleChar, dtype="S") + else: + a = self.tupleChar + if a.ndim == 0: + b = a.copy() + else: + b = a[::2] + # Ensure that this numpy string is non-contiguous + if len(b) > 1: + self.assertEqual(b.flags.contiguous, False) + return b + + def test01_char_nc(self): + b = self.setup01_char_nc() + self.write_read(b) + + def test01_char_nc_out_arg(self): + b = self.setup01_char_nc() + self.write_read_out_arg(b) + + def test01_char_nc_atom_shape_args(self): + b = self.setup01_char_nc() + self.write_read_atom_shape_args(b) + + def test02_types(self): + """Data integrity during recovery (numerical types)""" + + typecodes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float32', 'float64', + 'complex64', 'complex128'] + + for name in ('float16', 'float96', 'float128', + 'complex192', 'complex256'): + atomname = name.capitalize() + 'Atom' + if hasattr(tb, atomname): + typecodes.append(name) + + for typecode in typecodes: + a = np.array(self.tupleInt, typecode) + self.write_read(a) + b = np.array(self.tupleInt, typecode) + self.write_read_out_arg(b) + c = np.array(self.tupleInt, typecode) + self.write_read_atom_shape_args(c) + + def test03_types_nc(self): + """Data integrity during recovery (non-contiguous numerical types)""" + + typecodes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float32', 'float64', + 'complex64', 'complex128', ] + + for name in ('float16', 'float96', 'float128', + 'complex192', 'complex256'): + atomname = name.capitalize() + 'Atom' + if hasattr(tb, atomname): + typecodes.append(name) + + for typecode in typecodes: + a = np.array(self.tupleInt, typecode) + if a.ndim 
== 0: + b1 = a.copy() + b2 = a.copy() + b3 = a.copy() + else: + b1 = a[::2] + b2 = a[::2] + b3 = a[::2] + # Ensure that this array is non-contiguous + if len(b1) > 1: + self.assertEqual(b1.flags.contiguous, False) + if len(b2) > 1: + self.assertEqual(b2.flags.contiguous, False) + if len(b3) > 1: + self.assertEqual(b3.flags.contiguous, False) + self.write_read(b1) + self.write_read_out_arg(b2) + self.write_read_atom_shape_args(b3) + + +class Basic0DOneTestCase(BasicTestCase): + # Scalar case + title = "Rank-0 case 1" + tupleInt = 3 + tupleChar = b"3" + endiancheck = True + + +class Basic0DTwoTestCase(BasicTestCase): + # Scalar case + title = "Rank-0 case 2" + tupleInt = 33 + tupleChar = b"33" + endiancheck = True + + +class Basic1DZeroTestCase(BasicTestCase): + # This test case is not supported by PyTables (HDF5 limitations) + # 1D case + title = "Rank-1 case 0" + tupleInt = () + tupleChar = () + endiancheck = False + + +class Basic1DOneTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 1" + tupleInt = (3,) + tupleChar = (b"a",) + endiancheck = True + + +class Basic1DTwoTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 2" + tupleInt = (3, 4) + tupleChar = (b"aaa",) + endiancheck = True + + +class Basic1DThreeTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 3" + tupleInt = (3, 4, 5) + tupleChar = (b"aaa", b"bbb",) + endiancheck = True + + +class Basic2DOneTestCase(BasicTestCase): + # 2D case + title = "Rank-2 case 1" + tupleInt = np.array(np.arange((4)**2)) + tupleInt.shape = (4,)*2 + tupleChar = np.array(["abc"]*3**2, dtype="S3") + tupleChar.shape = (3,)*2 + endiancheck = True + + +class Basic2DTwoTestCase(BasicTestCase): + # 2D case, with a multidimensional dtype + title = "Rank-2 case 2" + tupleInt = np.tile(np.arange(4, dtype=np.int64), [4, 1]) + tupleChar = np.array(["abc"]*3, dtype=("S3", (3,))) + endiancheck = True + + +class Basic10DTestCase(BasicTestCase): + # 10D case + title = "Rank-10 test" + tupleInt = np.array(np.arange((2)**10)) + tupleInt.shape = (2,)*10 + tupleChar = np.array(["abc"]*2**10, dtype="S3") + tupleChar.shape = (2,)*10 + endiancheck = True + + +class Basic32DTestCase(BasicTestCase): + # 32D case (maximum) + title = "Rank-32 test" + tupleInt = np.array((32,)) + tupleInt.shape = (1,)*32 + tupleChar = np.array(["121"], dtype="S3") + tupleChar.shape = (1,)*32 + + +class ReadOutArgumentTests(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.size = 1000 + + def create_array(self): + array = np.arange(self.size, dtype='f8') + disk_array = self.h5file.create_array('/', 'array', array) + return array, disk_array + + def test_read_entire_array(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size, ), 'f8') + disk_array.read(out=out_buffer) + np.testing.assert_equal(out_buffer, array) + + def test_read_contiguous_slice1(self): + array, disk_array = self.create_array() + out_buffer = np.arange(self.size, dtype='f8') + out_buffer = np.random.permutation(out_buffer) + out_buffer_orig = out_buffer.copy() + start = self.size // 2 + disk_array.read(start=start, stop=self.size, out=out_buffer[start:]) + np.testing.assert_equal(out_buffer[start:], array[start:]) + np.testing.assert_equal(out_buffer[:start], out_buffer_orig[:start]) + + def test_read_contiguous_slice2(self): + array, disk_array = self.create_array() + out_buffer = np.arange(self.size, dtype='f8') + out_buffer = np.random.permutation(out_buffer) + out_buffer_orig = out_buffer.copy() + start = self.size // 4 + stop = 
self.size - start + disk_array.read(start=start, stop=stop, out=out_buffer[start:stop]) + np.testing.assert_equal(out_buffer[start:stop], array[start:stop]) + np.testing.assert_equal(out_buffer[:start], out_buffer_orig[:start]) + np.testing.assert_equal(out_buffer[stop:], out_buffer_orig[stop:]) + + def test_read_non_contiguous_slice_contiguous_buffer(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size // 2, ), dtype='f8') + disk_array.read(start=0, stop=self.size, step=2, out=out_buffer) + np.testing.assert_equal(out_buffer, array[0:self.size:2]) + + def test_read_non_contiguous_buffer(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size, ), 'f8') + out_buffer_slice = out_buffer[0:self.size:2] + + with self.assertRaisesRegex(ValueError, + 'output array not C contiguous'): + disk_array.read(0, self.size, 2, out_buffer_slice) + + def test_buffer_too_small(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size // 2, ), 'f8') + self.assertRaises(ValueError, disk_array.read, 0, self.size, 1, + out_buffer) + try: + disk_array.read(0, self.size, 1, out_buffer) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + def test_buffer_too_large(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size + 1, ), 'f8') + self.assertRaises(ValueError, disk_array.read, 0, self.size, 1, + out_buffer) + try: + disk_array.read(0, self.size, 1, out_buffer) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + +class SizeOnDiskInMemoryPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.array_size = (10, 10) + self.array = self.h5file.create_array( + '/', 'somearray', np.zeros(self.array_size, 'i4')) + + def test_all_zeros(self): + self.assertEqual(self.array.size_on_disk, 10 * 10 * 4) + self.assertEqual(self.array.size_in_memory, 10 * 10 * 4) + + +class UnalignedAndComplexTestCase(common.TempFileMixin, + common.PyTablesTestCase): + """Basic test for all the supported typecodes present in numpy. + + Most of them are included on PyTables. + + """ + + def setUp(self): + super().setUp() + self.root = self.h5file.root + + def write_read(self, testArray): + if common.verbose: + print('\n', '-=' * 30) + print("\nRunning test for array with type '%s'" % + testArray.dtype.type) + + # Create the array under root and name 'somearray' + a = testArray + if self.endiancheck: + byteorder = {"little": "big", "big": "little"}[sys.byteorder] + else: + byteorder = sys.byteorder + + self.h5file.create_array(self.root, 'somearray', a, "Some array", + byteorder=byteorder) + + if self.reopen: + self._reopen() + self.root = self.h5file.root + + # Read the saved array + b = self.root.somearray.read() + + # Get an array to be compared in the correct byteorder + c = a.newbyteorder(byteorder) + + # Compare them. They should be equal. 
+ if not common.allequal(c, b) and common.verbose: + print("Write and read arrays differ!") + print("Array written:", a) + print("Array written shape:", a.shape) + print("Array written itemsize:", a.itemsize) + print("Array written type:", a.dtype.type) + print("Array read:", b) + print("Array read shape:", b.shape) + print("Array read itemsize:", b.itemsize) + print("Array read type:", b.dtype.type) + + # Check strictly the array equality + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.shape, self.root.somearray.shape) + if a.dtype.byteorder != "|": + self.assertEqual(a.dtype, b.dtype) + self.assertEqual(a.dtype, self.root.somearray.atom.dtype) + self.assertEqual(tb.utils.byteorders[b.dtype.byteorder], + sys.byteorder) + self.assertEqual(self.root.somearray.byteorder, byteorder) + + self.assertTrue(common.allequal(c, b)) + + def test01_signedShort_unaligned(self): + """Checking an unaligned signed short integer array""" + + r = np.rec.array(b'a'*200, formats='i1,f4,i2', shape=10) + a = r["f2"] + # Ensure that this array is non-aligned + self.assertEqual(a.flags.aligned, False) + self.assertEqual(a.dtype.type, np.int16) + self.write_read(a) + + def test02_float_unaligned(self): + """Checking an unaligned single precision array""" + + r = np.rec.array(b'a'*200, formats='i1,f4,i2', shape=10) + a = r["f1"] + # Ensure that this array is non-aligned + self.assertEqual(a.flags.aligned, 0) + self.assertEqual(a.dtype.type, np.float32) + self.write_read(a) + + def test03_byte_offset(self): + """Checking an offsetted byte array""" + + r = np.arange(100, dtype=np.int8) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test04_short_offset(self): + """Checking an offsetted unsigned short int precision array""" + + r = np.arange(100, dtype=np.uint32) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test05_int_offset(self): + """Checking an offsetted integer array""" + + r = np.arange(100, dtype=np.int32) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test06_longlongint_offset(self): + """Checking an offsetted long long integer array""" + + r = np.arange(100, dtype=np.int64) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test07_float_offset(self): + """Checking an offsetted single precision array""" + + r = np.arange(100, dtype=np.float32) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test08_double_offset(self): + """Checking an offsetted double precision array""" + + r = np.arange(100, dtype=np.float64) + r.shape = (10, 10) + a = r[2] + self.write_read(a) + + def test09_float_offset_unaligned(self): + """Checking an unaligned and offsetted single precision array""" + + r = np.rec.array(b'a'*200, formats='i1,3f4,i2', shape=10) + a = r["f1"][3] + # Ensure that this array is non-aligned + self.assertEqual(a.flags.aligned, False) + self.assertEqual(a.dtype.type, np.float32) + self.write_read(a) + + def test10_double_offset_unaligned(self): + """Checking an unaligned and offsetted double precision array""" + + r = np.rec.array(b'a'*400, formats='i1,3f8,i2', shape=10) + a = r["f1"][3] + # Ensure that this array is non-aligned + self.assertEqual(a.flags.aligned, False) + self.assertEqual(a.dtype.type, np.float64) + self.write_read(a) + + def test11_int_byteorder(self): + """Checking setting data with different byteorder in a range + (integer)""" + + # Save an array with the reversed byteorder on it + a = np.arange(25, dtype=np.int32).reshape(5, 5) + a = a.byteswap() + a = a.newbyteorder() + array = self.h5file.create_array( + 
self.h5file.root, 'array', a, "byteorder (int)") + # Read a subarray (got an array with the machine byteorder) + b = array[2:4, 3:5] + b = b.byteswap() + b = b.newbyteorder() + # Set this subarray back to the array + array[2:4, 3:5] = b + b = b.byteswap() + b = b.newbyteorder() + # Set this subarray back to the array + array[2:4, 3:5] = b + # Check that the array is back in the correct byteorder + c = array[...] + if common.verbose: + print("byteorder of array on disk-->", array.byteorder) + print("byteorder of subarray-->", b.dtype.byteorder) + print("subarray-->", b) + print("retrieved array-->", c) + self.assertTrue(common.allequal(a, c)) + + def test12_float_byteorder(self): + """Checking setting data with different byteorder in a range (float)""" + + # Save an array with the reversed byteorder on it + a = np.arange(25, dtype=np.float64).reshape(5, 5) + a = a.byteswap() + a = a.newbyteorder() + array = self.h5file.create_array( + self.h5file.root, 'array', a, "byteorder (float)") + # Read a subarray (got an array with the machine byteorder) + b = array[2:4, 3:5] + b = b.byteswap() + b = b.newbyteorder() + # Set this subarray back to the array + array[2:4, 3:5] = b + b = b.byteswap() + b = b.newbyteorder() + # Set this subarray back to the array + array[2:4, 3:5] = b + # Check that the array is back in the correct byteorder + c = array[...] + if common.verbose: + print("byteorder of array on disk-->", array.byteorder) + print("byteorder of subarray-->", b.dtype.byteorder) + print("subarray-->", b) + print("retrieved array-->", c) + self.assertTrue(common.allequal(a, c)) + + +class ComplexNotReopenNotEndianTestCase(UnalignedAndComplexTestCase): + endiancheck = False + reopen = False + + +class ComplexReopenNotEndianTestCase(UnalignedAndComplexTestCase): + endiancheck = False + reopen = True + + +class ComplexNotReopenEndianTestCase(UnalignedAndComplexTestCase): + endiancheck = True + reopen = False + + +class ComplexReopenEndianTestCase(UnalignedAndComplexTestCase): + endiancheck = True + reopen = True + + +class GroupsArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + """This test class checks combinations of arrays with groups.""" + + def test00_iterativeGroups(self): + """Checking combinations of arrays with groups.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_iterativeGroups..." 
%
+                  self.__class__.__name__)
+
+        # Get the root group
+        group = self.h5file.root
+
+        # Set the type codes to test
+        # The typecodes below do expose an ambiguity that is reported in:
+        # http://projects.scipy.org/scipy/numpy/ticket/283 and
+        # http://projects.scipy.org/scipy/numpy/ticket/290
+        typecodes = ['b', 'B', 'h', 'H', 'i', 'I', 'l', 'L', 'q', 'f', 'd',
+                     'F', 'D']
+        if hasattr(tb, 'Float16Atom'):
+            typecodes.append('e')
+        if hasattr(tb, 'Float96Atom') or hasattr(tb, 'Float128Atom'):
+            typecodes.append('g')
+        if (hasattr(tb, 'Complex192Atom') or
+                hasattr(tb, 'Complex256Atom')):
+            typecodes.append('G')
+
+        for i, typecode in enumerate(typecodes):
+            a = np.ones((3,), typecode)
+            dsetname = 'array_' + typecode
+            if common.verbose:
+                print("Creating dataset:", group._g_join(dsetname))
+            self.h5file.create_array(group, dsetname, a, "Large array")
+            group = self.h5file.create_group(group, 'group' + str(i))
+
+        # Reopen the file
+        self._reopen()
+
+        # Get the root group
+        group = self.h5file.root
+
+        # Get the metadata on the previously saved arrays
+        for i, typecode in enumerate(typecodes):
+            # Create an array for later comparison
+            a = np.ones((3,), typecode)
+            # Get the dset object hanging from group
+            dset = getattr(group, 'array_' + typecode)
+            # Get the actual array
+            b = dset.read()
+            if common.verbose:
+                print("Info from dataset:", dset._v_pathname)
+                print(" shape ==>", dset.shape, end=' ')
+                print(" type ==> %s" % dset.atom.dtype)
+                print("Array b read from file. Shape: ==>", b.shape, end=' ')
+                print(". Type ==> %s" % b.dtype)
+            self.assertEqual(a.shape, b.shape)
+            self.assertEqual(a.dtype, b.dtype)
+            self.assertTrue(common.allequal(a, b))
+
+            # Iterate over the next group
+            group = getattr(group, 'group' + str(i))
+
+    def test01_largeRankArrays(self):
+        """Checking creation of large rank arrays (0 < rank <= 32).
+        It also uses array ranks ranging up to maxrank.
+        """
+
+        # maximum level of recursion (deepest group level) achieved:
+        # maxrank = 32 (for an effective maximum rank of 32)
+        # This limit is due to HDF5 library limitations.
+        minrank = 1
+        maxrank = 32
+
+        if common.verbose:
+            print('\n', '-=' * 30)
+            print("Running %s.test01_largeRankArrays..." %
+                  self.__class__.__name__)
+            print("Maximum rank for tested arrays:", maxrank)
+
+        group = self.h5file.root
+        if common.verbose:
+            print("Rank array writing progress: ", end=' ')
+        for rank in range(minrank, maxrank + 1):
+            # Create an array of integers, with incrementally bigger ranks
+            a = np.ones((1,) * rank, np.int32)
+            if common.verbose:
+                print("%3d," % (rank), end=' ')
+            self.h5file.create_array(group, "array", a, "Rank: %s" % rank)
+            group = self.h5file.create_group(group, 'group' + str(rank))
+
+        # Reopen the file
+        self._reopen()
+
+        group = self.h5file.root
+        if common.verbose:
+            print()
+            print("Rank array reading progress: ")
+        # Get the metadata on the previously saved arrays
+        for rank in range(minrank, maxrank + 1):
+            # Create an array for later comparison
+            a = np.ones((1,) * rank, np.int32)
+            # Get the actual array
+            b = group.array.read()
+            if common.verbose:
+                print("%3d," % (rank), end=' ')
+            if common.verbose and not common.allequal(a, b):
+                print("Info from dataset:", group.array._v_pathname)
+                print(" Shape: ==>", group.array.shape, end=' ')
+                print(" typecode ==> %c" % group.array.typecode)
+                print("Array b read from file. Shape: ==>", b.shape, end=' ')
+                print(". 
Type ==> %c" % b.dtype) + + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.dtype, b.dtype) + self.assertTrue(common.allequal(a, b)) + + # print(self.h5file) + # Iterate over the next group + group = self.h5file.get_node(group, 'group' + str(rank)) + + if common.verbose: + print() # This flush the stdout buffer + + +class CopyTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_copy(self): + """Checking Array.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_copy..." % self.__class__.__name__) + + # Create an Array + arr = np.array([[456, 2], [3, 457]], dtype='int16') + array1 = self.h5file.create_array( + self.h5file.root, 'array1', arr, "title array1") + + # Copy to another Array + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.title, array2.title) + + def test02_copy(self): + """Checking Array.copy() method (where specified)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_copy..." % self.__class__.__name__) + + # Create an Array + arr = np.array([[456, 2], [3, 457]], dtype='int16') + array1 = self.h5file.create_array( + self.h5file.root, 'array1', arr, "title array1") + + # Copy to another Array + group1 = self.h5file.create_group("/", "group1") + array2 = array1.copy(group1, 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.group1.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.title, array2.title) + + def test03_copy(self): + """Checking Array.copy() method (checking title copying)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_copy..." 
% self.__class__.__name__) + + # Create an Array + arr = np.array([[456, 2], [3, 457]], dtype='int16') + array1 = self.h5file.create_array( + self.h5file.root, 'array1', arr, "title array1") + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + # Copy it to another Array + array2 = array1.copy('/', 'array2', title="title array2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + # Assert user attributes + if common.verbose: + print("title of destination array-->", array2.title) + self.assertEqual(array2.title, "title array2") + + def test04_copy(self): + """Checking Array.copy() method (user attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_copy..." % self.__class__.__name__) + + # Create an Array + arr = np.array([[456, 2], [3, 457]], dtype='int16') + array1 = self.h5file.create_array( + self.h5file.root, 'array1', arr, "title array1") + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(array2.attrs.attr1, "attr1") + self.assertEqual(array2.attrs.attr2, 2) + + def test04b_copy(self): + """Checking Array.copy() method (user attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_copy..." % self.__class__.__name__) + + # Create an Array + arr = np.array([[456, 2], [3, 457]], dtype='int16') + array1 = self.h5file.create_array( + self.h5file.root, 'array1', arr, "title array1") + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(hasattr(array2.attrs, "attr1"), 0) + self.assertEqual(hasattr(array2.attrs, "attr2"), 0) + + +class CloseCopyTestCase(CopyTestCase): + close = 1 + + +class OpenCopyTestCase(CopyTestCase): + close = 0 + + +class CopyIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_index(self): + """Checking Array.copy() method with indexes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_index..." 
% self.__class__.__name__) + + # Create a numpy + r = np.arange(200, dtype='int32') + r.shape = (100, 2) + # Save it in a array: + array1 = self.h5file.create_array( + self.h5file.root, 'array1', r, "title array1") + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], array2.nrows) + + def test02_indexclosef(self): + """Checking Array.copy() method with indexes (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_indexclosef..." % self.__class__.__name__) + + # Create a numpy + r = np.arange(200, dtype='int32') + r.shape = (100, 2) + # Save it in a array: + array1 = self.h5file.create_array( + self.h5file.root, 'array1', r, "title array1") + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + # Close and reopen the file + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], array2.nrows) + + +class CopyIndex1TestCase(CopyIndexTestCase): + start = 0 + stop = 7 + step = 1 + + +class CopyIndex2TestCase(CopyIndexTestCase): + start = 0 + stop = -1 + step = 1 + + +class CopyIndex3TestCase(CopyIndexTestCase): + start = 1 + stop = 7 + step = 1 + + +class CopyIndex4TestCase(CopyIndexTestCase): + start = 0 + stop = 6 + step = 1 + + +class CopyIndex5TestCase(CopyIndexTestCase): + start = 3 + stop = 7 + step = 1 + + +class CopyIndex6TestCase(CopyIndexTestCase): + start = 3 + stop = 6 + step = 2 + + +class CopyIndex7TestCase(CopyIndexTestCase): + start = 0 + stop = 7 + step = 10 + + +class CopyIndex8TestCase(CopyIndexTestCase): + start = 6 + stop = -1 # Negative values means starting from the end + step = 1 + + +class CopyIndex9TestCase(CopyIndexTestCase): + start = 3 + stop = 4 + step = 1 + + +class CopyIndex10TestCase(CopyIndexTestCase): + start = 3 + stop = 4 + step = 2 + + +class CopyIndex11TestCase(CopyIndexTestCase): + start = -3 + stop = -1 + step = 2 + + +class CopyIndex12TestCase(CopyIndexTestCase): + start = -1 # Should point to the last element + stop = None # None should mean the last element (including it) + step = 1 + + +class GetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_single(self): + """Single element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = 
self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0], type(a[0])) + print("Read first element:", arr[0], type(arr[0])) + self.assertTrue(common.allequal(a[0], arr[0])) + self.assertEqual(type(a[0]), type(arr[0])) + + def test01_single(self): + """Single element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0], type(a[0])) + print("Read first element:", arr[0], type(arr[0])) + self.assertEqual(a[0], arr[0]) + self.assertEqual(type(a[0]), type(arr[0])) + + def test02_range(self): + """Range element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertTrue(common.allequal(a[1:4], arr[1:4])) + + def test03_range(self): + """Range element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertTrue(common.allequal(a[1:4], arr[1:4])) + + def test04_range(self): + """Range element access, strided (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertTrue(common.allequal(a[1:4:2], arr[1:4:2])) + + def test05_range(self): + """Range element access, strided (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertTrue(common.allequal(a[1:4:2], arr[1:4:2])) + + def test06_negativeIndex(self): + """Negative Index element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original last element:", a[-1]) + print("Read last element:", arr[-1]) + self.assertTrue(common.allequal(a[-1], arr[-1])) + + def test07_negativeIndex(self): + """Negative Index element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + 
self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original before last element:", a[-2]) + print("Read before last element:", arr[-2]) + if isinstance(a[-2], np.ndarray): + self.assertTrue(common.allequal(a[-2], arr[-2])) + else: + self.assertEqual(a[-2], arr[-2]) + + def test08_negativeRange(self): + """Negative range element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertTrue(common.allequal(a[-4:-1], arr[-4:-1])) + + def test09_negativeRange(self): + """Negative range element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertTrue(common.allequal(a[-4:-1], arr[-4:-1])) + + +class GI1NATestCase(GetItemTestCase, common.PyTablesTestCase): + title = "Rank-1 case 1" + numericalList = np.array([3]) + numericalListME = np.array([3, 2, 1, 0, 4, 5, 6]) + charList = np.array(["3"], 'S') + charListME = np.array( + ["321", "221", "121", "021", "421", "521", "621"], 'S') + + +class GI1NAOpenTestCase(GI1NATestCase): + close = 0 + + +class GI1NACloseTestCase(GI1NATestCase): + close = 1 + + +class GI2NATestCase(GetItemTestCase): + # A more complex example + title = "Rank-1,2 case 2" + numericalList = np.array([3, 4]) + numericalListME = np.array([[3, 2, 1, 0, 4, 5, 6], + [2, 1, 0, 4, 5, 6, 7], + [4, 3, 2, 1, 0, 4, 5], + [3, 2, 1, 0, 4, 5, 6], + [3, 2, 1, 0, 4, 5, 6]]) + + charList = np.array(["a", "b"], 'S') + charListME = np.array( + [["321", "221", "121", "021", "421", "521", "621"], + ["21", "21", "11", "02", "42", "21", "61"], + ["31", "21", "12", "21", "41", "51", "621"], + ["321", "221", "121", "021", + "421", "521", "621"], + ["3241", "2321", "13216", + "0621", "4421", "5421", "a621"], + ["a321", "s221", "d121", "g021", "b421", "5vvv21", "6zxzxs21"]], 'S') + + +class GI2NAOpenTestCase(GI2NATestCase): + close = 0 + + +class GI2NACloseTestCase(GI2NATestCase): + close = 1 + + +class SetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_single(self): + """Single element update (character types)""" + + # Create the array under root and name 'somearray' + a = self.charList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify a single element of a and arr: + a[0] = b"b" + arr[0] = b"b" + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0]) + print("Read first element:", arr[0]) + self.assertTrue(common.allequal(a[0], arr[0])) + + def test01_single(self): + """Single element update (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if 
self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + a[0] = 333 + arr[0] = 333 + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0]) + print("Read first element:", arr[0]) + self.assertEqual(a[0], arr[0]) + + def test02_range(self): + """Range element update (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + a[1:3] = b"xXx" + arr[1:3] = b"xXx" + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertTrue(common.allequal(a[1:4], arr[1:4])) + + def test03_range(self): + """Range element update (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = slice(1, 3, None) + rng = np.arange(a[s].size)*2 + 3 + rng.shape = a[s].shape + a[s] = rng + arr[s] = rng + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertTrue(common.allequal(a[1:4], arr[1:4])) + + def test04_range(self): + """Range element update, strided (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = slice(1, 4, 2) + a[s] = b"xXx" + arr[s] = b"xXx" + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertTrue(common.allequal(a[1:4:2], arr[1:4:2])) + + def test05_range(self): + """Range element update, strided (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = slice(1, 4, 2) + rng = np.arange(a[s].size)*2 + 3 + rng.shape = a[s].shape + a[s] = rng + arr[s] = rng + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertTrue(common.allequal(a[1:4:2], arr[1:4:2])) + + def test06_negativeIndex(self): + """Negative Index element update (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = -1 + a[s] = b"xXx" + arr[s] = b"xXx" + + # Get and compare an element + if common.verbose: + print("Original last element:", a[-1]) + print("Read last element:", arr[-1]) + self.assertTrue(common.allequal(a[-1], arr[-1])) + + def test07_negativeIndex(self): + """Negative Index element update (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = 
self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = -2 + a[s] = a[s]*2 + 3 + arr[s] = arr[s]*2 + 3 + + # Get and compare an element + if common.verbose: + print("Original before last element:", a[-2]) + print("Read before last element:", arr[-2]) + if isinstance(a[-2], np.ndarray): + self.assertTrue(common.allequal(a[-2], arr[-2])) + else: + self.assertEqual(a[-2], arr[-2]) + + def test08_negativeRange(self): + """Negative range element update (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = slice(-4, -1, None) + a[s] = b"xXx" + arr[s] = b"xXx" + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertTrue(common.allequal(a[-4:-1], arr[-4:-1])) + + def test09_negativeRange(self): + """Negative range element update (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of a and arr: + s = slice(-3, -1, None) + rng = np.arange(a[s].size)*2 + 3 + rng.shape = a[s].shape + a[s] = rng + arr[s] = rng + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertTrue(common.allequal(a[-4:-1], arr[-4:-1])) + + def test10_outOfRange(self): + """Out of range update (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen('a') + arr = self.h5file.root.somearray + + # Modify elements of arr that are out of range: + s = slice(1, a.shape[0]+1, None) + s2 = slice(1, 1000, None) + rng = np.arange(a[s].size)*2 + 3 + rng.shape = a[s].shape + a[s] = rng + rng2 = np.arange(a[s2].size)*2 + 3 + rng2.shape = a[s2].shape + arr[s2] = rng2 + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertTrue(common.allequal(a[-4:-1], arr[-4:-1])) + + +class SI1NATestCase(SetItemTestCase, common.PyTablesTestCase): + title = "Rank-1 case 1" + numericalList = np.array([3]) + numericalListME = np.array([3, 2, 1, 0, 4, 5, 6]) + charList = np.array(["3"], 'S') + charListME = np.array( + ["321", "221", "121", "021", "421", "521", "621"], 'S') + + +class SI1NAOpenTestCase(SI1NATestCase): + close = 0 + + +class SI1NACloseTestCase(SI1NATestCase): + close = 1 + + +class SI2NATestCase(SetItemTestCase): + # A more complex example + title = "Rank-1,2 case 2" + numericalList = np.array([3, 4]) + numericalListME = np.array([[3, 2, 1, 0, 4, 5, 6], + [2, 1, 0, 4, 5, 6, 7], + [4, 3, 2, 1, 0, 4, 5], + [3, 2, 1, 0, 4, 5, 6], + [3, 2, 1, 0, 4, 5, 6]]) + + charList = np.array(["a", "b"], 'S') + charListME = np.array( + [["321", "221", "121", "021", "421", "521", "621"], + ["21", "21", "11", "02", "42", "21", "61"], + ["31", "21", "12", "21", "41", "51", "621"], + ["321", "221", "121", "021", + 
"421", "521", "621"], + ["3241", "2321", "13216", + "0621", "4421", "5421", "a621"], + ["a321", "s221", "d121", "g021", "b421", "5vvv21", "6zxzxs21"]], 'S') + + +class SI2NAOpenTestCase(SI2NATestCase): + close = 0 + + +class SI2NACloseTestCase(SI2NATestCase): + close = 1 + + +class GeneratorTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00a_single(self): + """Testing generator access to Arrays, single elements (char)""" + + # Create the array under root and name 'somearray' + a = self.charList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + def test00b_me(self): + """Testing generator access to Arrays, multiple elements (char)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + ga = list(a) + garr = list(arr) + + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + for x_ga, x_garr in zip(ga, garr): + self.assertTrue(common.allequal(x_ga, x_garr)) + + def test01a_single(self): + """Testing generator access to Arrays, single elements (numeric)""" + + # Create the array under root and name 'somearray' + a = self.numericalList + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + def test01b_me(self): + """Testing generator access to Arrays, multiple elements (numeric)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array( + self.h5file.root, 'somearray', a, "Some array") + + if self.close: + self._reopen() + arr = self.h5file.root.somearray + + # Get and compare an element + ga = list(a) + garr = list(arr) + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + for x_ga, x_garr in zip(ga, garr): + self.assertTrue(common.allequal(x_ga, x_garr)) + + +class GE1NATestCase(GeneratorTestCase): + title = "Rank-1 case 1" + numericalList = np.array([3]) + numericalListME = np.array([3, 2, 1, 0, 4, 5, 6]) + charList = np.array(["3"], 'S') + charListME = np.array( + ["321", "221", "121", "021", "421", "521", "621"], 'S') + + +class GE1NAOpenTestCase(GE1NATestCase): + close = 0 + + +class GE1NACloseTestCase(GE1NATestCase): + close = 1 + + +class GE2NATestCase(GeneratorTestCase): + # A more complex example + title = "Rank-1,2 case 2" + numericalList = np.array([3, 4]) + numericalListME = np.array([[3, 2, 1, 0, 4, 5, 6], + [2, 1, 0, 4, 5, 6, 7], + [4, 3, 2, 1, 0, 4, 5], + [3, 2, 1, 0, 4, 5, 6], + [3, 2, 1, 0, 4, 5, 6]]) + + charList = np.array(["a", "b"], 'S') + charListME = np.array( + [["321", "221", "121", "021", "421", "521", "621"], + ["21", "21", "11", "02", "42", "21", "61"], + ["31", "21", "12", "21", "41", "51", "621"], + ["321", "221", "121", "021", + 
"421", "521", "621"], + ["3241", "2321", "13216", + "0621", "4421", "5421", "a621"], + ["a321", "s221", "d121", "g021", "b421", "5vvv21", "6zxzxs21"]], 'S') + + +class GE2NAOpenTestCase(GE2NATestCase): + close = 0 + + +class GE2NACloseTestCase(GE2NATestCase): + close = 1 + + +class NonHomogeneousTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test(self): + """Test for creation of non-homogeneous arrays.""" + + # This checks ticket #12. + self.assertRaises((ValueError, TypeError), + self.h5file.create_array, '/', 'test', [1, [2, 3]]) + self.assertRaises(tb.NoSuchNodeError, self.h5file.remove_node, '/test') + + +class TruncateTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test(self): + """Test for unability to truncate Array objects.""" + + array1 = self.h5file.create_array('/', 'array1', [0, 2]) + self.assertRaises(TypeError, array1.truncate, 0) + + +class PointSelectionTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + # Limits for selections + self.limits = [ + (0, 1), # just one element + (20, -10), # no elements + (-10, 4), # several elements + (0, 10), # several elements (again) + ] + + # Create a sample array + size = np.prod(self.shape) + nparr = np.arange(size, dtype=np.int32).reshape(self.shape) + self.nparr = nparr + self.tbarr = self.h5file.create_array(self.h5file.root, 'array', nparr) + + def test01a_read(self): + """Test for point-selections (read, boolean keys).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = (nparr >= value1) & (nparr < value2) + if common.verbose: + print("Selection to test:", key) + a = nparr[key] + b = tbarr[key] + self.assertTrue( + np.alltrue(a == b), + "NumPy array and PyTables selections does not match.") + + def test01b_read(self): + """Test for point-selections (read, integer keys).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = np.where((nparr >= value1) & (nparr < value2)) + if common.verbose: + print("Selection to test:", key) + a = nparr[key] + b = tbarr[key] + self.assertTrue( + np.alltrue(a == b), + "NumPy array and PyTables selections does not match.") + + def test01c_read(self): + """Test for point-selections (read, float keys).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = np.where((nparr >= value1) & (nparr < value2)) + if common.verbose: + print("Selection to test:", key) + # a = nparr[key] + fkey = np.array(key, "f4") + self.assertRaises((IndexError, TypeError), tbarr.__getitem__, fkey) + + def test01d_read(self): + nparr = self.nparr + tbarr = self.tbarr + + for key in self.working_keyset: + if nparr.ndim > 1: + key = tuple(key) + if common.verbose: + print("Selection to test:", key) + a = nparr[key] + b = tbarr[key] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01e_read(self): + tbarr = self.tbarr + + for key in self.not_working_keyset: + if common.verbose: + print("Selection to test:", key) + + self.assertRaises(IndexError, tbarr.__getitem__, key) + + def test02a_write(self): + """Test for point-selections (write, boolean keys).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = (nparr >= value1) & (nparr < value2) + if common.verbose: + print("Selection to test:", key) + s = nparr[key] + nparr[key] = s * 2 + tbarr[key] = s * 2 + a = nparr[:] + b = tbarr[:] + self.assertTrue( + np.alltrue(a == b), + "NumPy array and PyTables 
modifications does not match.") + + def test02b_write(self): + """Test for point-selections (write, integer keys).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = np.where((nparr >= value1) & (nparr < value2)) + if common.verbose: + print("Selection to test:", key) + s = nparr[key] + nparr[key] = s * 2 + tbarr[key] = s * 2 + a = nparr[:] + b = tbarr[:] + self.assertTrue( + np.alltrue(a == b), + "NumPy array and PyTables modifications does not match.") + + def test02c_write(self): + """Test for point-selections (write, integer values, broadcast).""" + + nparr = self.nparr + tbarr = self.tbarr + for value1, value2 in self.limits: + key = np.where((nparr >= value1) & (nparr < value2)) + if common.verbose: + print("Selection to test:", key) + # s = nparr[key] + nparr[key] = 2 # force a broadcast + tbarr[key] = 2 # force a broadcast + a = nparr[:] + b = tbarr[:] + self.assertTrue( + np.alltrue(a == b), + "NumPy array and PyTables modifications does not match.") + + +class PointSelection0(PointSelectionTestCase): + shape = (3,) + working_keyset = [ + [0, 1], + [0, -1], + ] + not_working_keyset = [ + [0, 3], + [0, 4], + [0, -4], + ] + + +class PointSelection1(PointSelectionTestCase): + shape = (5, 3, 3) + working_keyset = [ + [(0, 0), (0, 1), (0, 0)], + [(0, 0), (0, -1), (0, 0)], + ] + not_working_keyset = [ + [(0, 0), (0, 3), (0, 0)], + [(0, 0), (0, 4), (0, 0)], + [(0, 0), (0, -4), (0, 0)], + [(0, 0), (0, -5), (0, 0)] + ] + + +class PointSelection2(PointSelectionTestCase): + shape = (7, 3) + working_keyset = [ + [(0, 0), (0, 1)], + [(0, 0), (0, -1)], + [(0, 0), (0, -2)], + ] + not_working_keyset = [ + [(0, 0), (0, 3)], + [(0, 0), (0, 4)], + [(0, 0), (0, -4)], + [(0, 0), (0, -5)], + ] + + +class PointSelection3(PointSelectionTestCase): + shape = (4, 3, 2, 1) + working_keyset = [ + [(0, 0), (0, 1), (0, 0), (0, 0)], + [(0, 0), (0, -1), (0, 0), (0, 0)], + ] + not_working_keyset = [ + [(0, 0), (0, 3), (0, 0), (0, 0)], + [(0, 0), (0, 4), (0, 0), (0, 0)], + [(0, 0), (0, -4), (0, 0), (0, 0)], + ] + + +class PointSelection4(PointSelectionTestCase): + shape = (1, 3, 2, 5, 6) + working_keyset = [ + [(0, 0), (0, 1), (0, 0), (0, 0), (0, 0)], + [(0, 0), (0, -1), (0, 0), (0, 0), (0, 0)], + ] + not_working_keyset = [ + [(0, 0), (0, 3), (0, 0), (0, 0), (0, 0)], + [(0, 0), (0, 4), (0, 0), (0, 0), (0, 0)], + [(0, 0), (0, -4), (0, 0), (0, 0), (0, 0)], + ] + + +class FancySelectionTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + m, n, o = self.shape + + # The next are valid selections for both NumPy and PyTables + self.working_keyset = [ + ([1, 3], slice(1, n-1), 2), + ([m-1, 1, 3, 2], slice(None), 2), # unordered lists supported + (slice(m), [n-1, 1, 0], slice(None)), + (slice(1, m, 3), slice(1, n), [o-1, 1, 0]), + (m-1, [2, 1], 1), + (1, 2, 1), # regular selection + ([1, 2], -2, -1), # negative indices + ([1, -2], 2, -1), # more negative indices + ([1, -2], 2, Ellipsis), # one ellipsis + (Ellipsis, [1, 2]), # one ellipsis + (np.array( + [1, -2], 'i4'), 2, -1), # array 32-bit instead of list + (np.array( + [-1, 2], 'i8'), 2, -1), # array 64-bit instead of list + ] + + # Using booleans instead of ints is deprecated since numpy 1.8 + # Tests for keys that have to support the __index__ attribute + # self.working_keyset.append( + # (False, True), # equivalent to (0,1) ;-) + # ) + + # Valid selections for NumPy, but not for PyTables (yet) + # The next should raise an IndexError + self.not_working_keyset = [ + np.array([False, 
True], dtype="b1"),  # boolean arrays
+            ([1, 2, 1], 2, 1),  # repeated values
+            ([1, 2], 2, [1, 2]),  # several lists
+            ([], 2, 1),  # empty selections
+            (Ellipsis, [1, 2], Ellipsis),  # several ellipsis
+            # Using booleans instead of ints is deprecated since numpy 1.8
+            ([False, True]),  # boolean values with incompatible shape
+        ]
+
+        # The next should raise an IndexError in both NumPy and PyTables
+        self.not_working_oob = [
+            ([1, 2], 2, 1000),  # out-of-bounds selections
+            ([1, 2], 2000, 1),  # out-of-bounds selections
+        ]
+
+        # The next should raise an IndexError in both NumPy and PyTables
+        self.not_working_too_many = [
+            ([1, 2], 2, 1, 1),
+        ]
+
+        # Create a sample array
+        nparr = np.empty(self.shape, dtype=np.int32)
+        data = np.arange(n * o, dtype=np.int32).reshape(n, o)
+        for i in range(m):
+            nparr[i] = data * i
+        self.nparr = nparr
+        self.tbarr = self.h5file.create_array(self.h5file.root, 'array', nparr)
+
+    def test01a_read(self):
+        """Test for fancy-selections (working selections, read)."""
+
+        nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.working_keyset:
+            if common.verbose:
+                print("Selection to test:", key)
+            a = nparr[key]
+            b = tbarr[key]
+            self.assertTrue(
+                np.alltrue(a == b),
+                "NumPy array and PyTables selections do not match.")
+
+    def test01b_read(self):
+        """Test for fancy-selections (not working selections, read)."""
+
+        # nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.not_working_keyset:
+            if common.verbose:
+                print("Selection to test:", key)
+            # a = nparr[key]
+            self.assertRaises(IndexError, tbarr.__getitem__, key)
+
+    def test01c_read(self):
+        """Test for fancy-selections (out-of-bound indexes, read)."""
+
+        nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.not_working_oob:
+            if common.verbose:
+                print("Selection to test:", key)
+            self.assertRaises(IndexError, nparr.__getitem__, key)
+            self.assertRaises(IndexError, tbarr.__getitem__, key)
+
+    def test01d_read(self):
+        """Test for fancy-selections (too many indexes, read)."""
+
+        nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.not_working_too_many:
+            if common.verbose:
+                print("Selection to test:", key)
+            # ValueError for numpy 1.6.x and earlier
+            # IndexError in numpy > 1.8.0
+            self.assertRaises((ValueError, IndexError), nparr.__getitem__, key)
+            self.assertRaises(IndexError, tbarr.__getitem__, key)
+
+    def test02a_write(self):
+        """Test for fancy-selections (working selections, write)."""
+
+        nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.working_keyset:
+            if common.verbose:
+                print("Selection to test:", key)
+            s = nparr[key]
+            nparr[key] = s * 2
+            tbarr[key] = s * 2
+            a = nparr[:]
+            b = tbarr[:]
+            self.assertTrue(
+                np.alltrue(a == b),
+                "NumPy array and PyTables modifications do not match.")
+
+    def test02b_write(self):
+        """Test for fancy-selections (working selections, write, broadcast)."""
+
+        nparr = self.nparr
+        tbarr = self.tbarr
+        for key in self.working_keyset:
+            if common.verbose:
+                print("Selection to test:", key)
+            # s = nparr[key]
+            nparr[key] = 2  # broadcast value
+            tbarr[key] = 2  # broadcast value
+            a = nparr[:]
+            b = tbarr[:]
+#            if common.verbose:
+#                print("NumPy modified array:", a)
+#                print("PyTables modified array:", b)
+            self.assertTrue(
+                np.alltrue(a == b),
+                "NumPy array and PyTables modifications do not match.")
+
+
+class FancySelection1(FancySelectionTestCase):
+    shape = (5, 3, 3)  # Minimum values
+
+
+class FancySelection2(FancySelectionTestCase):
+    # shape = (5, 3, 3)  # Minimum values
+    shape = (7, 3, 3)
+
+
+class 
FancySelection3(FancySelectionTestCase): + # shape = (5, 3, 3) # Minimum values + shape = (7, 4, 5) + + +class FancySelection4(FancySelectionTestCase): + # shape = (5, 3, 3) # Minimum values + shape = (5, 3, 10) + + +class CopyNativeHDF5MDAtom(common.PyTablesTestCase): + + def setUp(self): + super().setUp() + filename = common.test_filename("array_mdatom.h5") + self.h5file = tb.open_file(filename, "r") + self.arr = self.h5file.root.arr + self.copy = tempfile.mktemp(".h5") + self.copyh = tb.open_file(self.copy, mode="w") + self.arr2 = self.arr.copy(self.copyh.root, newname="arr2") + + def tearDown(self): + self.h5file.close() + self.copyh.close() + Path(self.copy).unlink() + super().tearDown() + + def test01_copy(self): + """Checking that native MD atoms are copied as-is""" + + self.assertEqual(self.arr.atom, self.arr2.atom) + self.assertEqual(self.arr.shape, self.arr2.shape) + + def test02_reopen(self): + """Checking that native MD atoms are copied as-is (re-open)""" + + self.copyh.close() + self.copyh = tb.open_file(self.copy, mode="r") + self.arr2 = self.copyh.root.arr2 + self.assertEqual(self.arr.atom, self.arr2.atom) + self.assertEqual(self.arr.shape, self.arr2.shape) + + +class AccessClosedTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + a = np.zeros((10, 10)) + self.array = self.h5file.create_array(self.h5file.root, 'array', a) + + def test_read(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.read) + + def test_getitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__getitem__, 0) + + def test_setitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__setitem__, 0, 0) + + +class BroadcastTest(common.TempFileMixin, common.PyTablesTestCase): + + def test(self): + """Test correct broadcasting when the array atom is not scalar.""" + + array_shape = (2, 3) + element_shape = (3,) + + dtype = np.dtype((np.int64, element_shape)) + atom = tb.Atom.from_dtype(dtype) + h5arr = self.h5file.create_array(self.h5file.root, 'array', + atom=atom, shape=array_shape) + + size = np.prod(element_shape) + nparr = np.arange(size).reshape(element_shape) + + h5arr[0] = nparr + self.assertTrue(np.all(h5arr[0] == nparr)) + + +class TestCreateArrayArgs(common.TempFileMixin, common.PyTablesTestCase): + where = '/' + name = 'array' + obj = np.array([[1, 2], [3, 4]]) + title = 'title' + byteorder = None + createparents = False + atom = tb.Atom.from_dtype(obj.dtype) + shape = obj.shape + + def test_positional_args(self): + self.h5file.create_array(self.where, self.name, self.obj, self.title) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_positional_args_atom_shape(self): + self.h5file.create_array(self.where, self.name, None, self.title, + self.byteorder, self.createparents, + self.atom, self.shape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + 
self.assertTrue(common.allequal(np.zeros_like(self.obj), nparr)) + + def test_kwargs_obj(self): + self.h5file.create_array(self.where, self.name, title=self.title, + obj=self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_shape_01(self): + ptarr = self.h5file.create_array(self.where, self.name, + title=self.title, + atom=self.atom, shape=self.shape) + ptarr[...] = self.obj + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_shape_02(self): + ptarr = self.h5file.create_array(self.where, self.name, + title=self.title, + atom=self.atom, shape=self.shape) + # ptarr[...] = self.obj + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(np.zeros_like(self.obj), nparr)) + + def test_kwargs_obj_atom(self): + ptarr = self.h5file.create_array(self.where, self.name, + title=self.title, + obj=self.obj, + atom=self.atom) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_shape(self): + ptarr = self.h5file.create_array(self.where, self.name, + title=self.title, + obj=self.obj, + shape=self.shape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_shape(self): + ptarr = self.h5file.create_array(self.where, self.name, + title=self.title, + obj=self.obj, + atom=self.atom, + shape=self.shape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_error(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_array, + self.where, + self.name, + title=self.title, + 
obj=self.obj, + atom=atom) + + def test_kwargs_obj_shape_error(self): + # atom = Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_array, + self.where, + self.name, + title=self.title, + obj=self.obj, + shape=shape) + + def test_kwargs_obj_atom_shape_error_01(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_array, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=self.shape) + + def test_kwargs_obj_atom_shape_error_02(self): + # atom = Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_array, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=self.atom, + shape=shape) + + def test_kwargs_obj_atom_shape_error_03(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_array, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=shape) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + for i in range(niter): + # The scalar case test should be refined in order to work + theSuite.addTest(common.unittest.makeSuite(Basic0DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic0DTwoTestCase)) + # theSuite.addTest(unittest.makeSuite(Basic1DZeroTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DTwoTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DThreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic2DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic2DTwoTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic10DTestCase)) + # The 32 dimensions case is tested on GroupsArray + # theSuite.addTest(unittest.makeSuite(Basic32DTestCase)) + theSuite.addTest(common.unittest.makeSuite(ReadOutArgumentTests)) + theSuite.addTest(common.unittest.makeSuite( + SizeOnDiskInMemoryPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(GroupsArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite( + ComplexNotReopenNotEndianTestCase)) + theSuite.addTest(common.unittest.makeSuite( + ComplexReopenNotEndianTestCase)) + theSuite.addTest(common.unittest.makeSuite( + ComplexNotReopenEndianTestCase)) + theSuite.addTest(common.unittest.makeSuite( + ComplexReopenEndianTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex1TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex2TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex3TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex4TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex5TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex6TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex7TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex8TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex9TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex10TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex11TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex12TestCase)) + theSuite.addTest(common.unittest.makeSuite(GI1NAOpenTestCase)) + 
theSuite.addTest(common.unittest.makeSuite(GI1NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(GI2NAOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(GI2NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(SI1NAOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(SI1NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(SI2NAOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(SI2NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE1NAOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE1NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE2NAOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE2NACloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(NonHomogeneousTestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateTestCase)) + theSuite.addTest(common.unittest.makeSuite(FancySelection1)) + theSuite.addTest(common.unittest.makeSuite(FancySelection2)) + theSuite.addTest(common.unittest.makeSuite(FancySelection3)) + theSuite.addTest(common.unittest.makeSuite(FancySelection4)) + theSuite.addTest(common.unittest.makeSuite(PointSelection0)) + theSuite.addTest(common.unittest.makeSuite(PointSelection1)) + theSuite.addTest(common.unittest.makeSuite(PointSelection2)) + theSuite.addTest(common.unittest.makeSuite(PointSelection3)) + theSuite.addTest(common.unittest.makeSuite(PointSelection4)) + theSuite.addTest(common.unittest.makeSuite(CopyNativeHDF5MDAtom)) + theSuite.addTest(common.unittest.makeSuite(AccessClosedTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestCreateArrayArgs)) + theSuite.addTest(common.unittest.makeSuite(BroadcastTest)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_attributes.py b/tables/tests/test_attributes.py new file mode 100644 index 0000000..ed27bdd --- /dev/null +++ b/tables/tests/test_attributes.py @@ -0,0 +1,1891 @@ +"""This test unit checks node attributes that are persistent (AttributeSet).""" + +import datetime +import sys +import warnings +from packaging.version import Version + +import numpy as np + +import tables as tb +from tables.tests import common + + +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) # 4-character String + var2 = tb.IntCol() # integer + var3 = tb.Int16Col() # short integer + var4 = tb.FloatCol() # double (double-precision) + var5 = tb.Float32Col() # float (single-precision) + + +class CreateTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.root = self.h5file.root + + # Create a table object + self.table = self.h5file.create_table(self.root, 'atable', + Record, "Table title") + # Create an array object + self.array = self.h5file.create_array(self.root, 'anarray', + [1], "Array title") + # Create a group object + self.group = self.h5file.create_group(self.root, 'agroup', + "Group title") + + def test01_setAttributes(self): + """Checking setting large string attributes (File methods)""" + + attrlength = 2048 + # Try to put a long string attribute on a group object + self.h5file.set_node_attr(self.root.agroup, "attr1", "p" * attrlength) + + # Now, try with a Table object + self.h5file.set_node_attr(self.root.atable, "attr1", "a" * attrlength) + + # Finally, try with an Array object + self.h5file.set_node_attr(self.root.anarray, "attr1", "n" * attrlength) + + if self.close: + if common.verbose: + print("(closing file 
version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(self.root.agroup, 'attr1'), + "p" * attrlength) + self.assertEqual(self.h5file.get_node_attr(self.root.atable, 'attr1'), + "a" * attrlength) + self.assertEqual(self.h5file.get_node_attr(self.root.anarray, 'attr1'), + "n" * attrlength) + + def reopen(self): + # Reopen + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + def check_missing(self, name): + self.reopen() + self.assertNotIn(name, self.root.agroup._v_attrs) + self.assertNotIn(name, self.root.atable.attrs) + self.assertNotIn(name, self.root.anarray.attrs) + + def check_name(self, name, val=''): + """Check validity of attribute name filtering""" + self.check_missing(name) + # Using File methods + self.h5file.set_node_attr(self.root.agroup, name, val) + self.h5file.set_node_attr(self.root.atable, name, val) + self.h5file.set_node_attr(self.root.anarray, name, val) + # Check File methods + self.reopen() + self.assertEqual(self.h5file.get_node_attr(self.root.agroup, name), + val) + self.assertEqual(self.h5file.get_node_attr(self.root.atable, name), + val) + self.assertEqual(self.h5file.get_node_attr(self.root.anarray, name), + val) + # Remove, file methods + self.h5file.del_node_attr(self.root.agroup, name) + self.h5file.del_node_attr(self.root.atable, name) + self.h5file.del_node_attr(self.root.anarray, name) + self.check_missing(name) + + # Using Node methods + self.root.agroup._f_setattr(name, val) + self.root.atable.set_attr(name, val) + self.root.anarray.set_attr(name, val) + # Check Node methods + self.reopen() + self.assertEqual(self.root.agroup._f_getattr(name), val) + self.assertEqual(self.root.atable.get_attr(name), val) + self.assertEqual(self.root.anarray.get_attr(name), val) + self.root.agroup._f_delattr(name) + self.root.atable.del_attr(name) + self.root.anarray.del_attr(name) + self.check_missing(name) + + # Using AttributeSet methods + setattr(self.root.agroup._v_attrs, name, val) + setattr(self.root.atable.attrs, name, val) + setattr(self.root.anarray.attrs, name, val) + # Check AttributeSet methods + self.reopen() + self.assertEqual(getattr(self.root.agroup._v_attrs, name), val) + self.assertEqual(getattr(self.root.atable.attrs, name), val) + self.assertEqual(getattr(self.root.anarray.attrs, name), val) + delattr(self.root.agroup._v_attrs, name) + delattr(self.root.atable.attrs, name) + delattr(self.root.anarray.attrs, name) + self.check_missing(name) + + # Using dict [] + self.root.agroup._v_attrs[name] = val + self.root.atable.attrs[name] = val + self.root.anarray.attrs[name] = val + # Check dict [] + self.reopen() + self.assertEqual(self.root.agroup._v_attrs[name], val) + self.assertEqual(self.root.atable.attrs[name], val) + self.assertEqual(self.root.anarray.attrs[name], val) + del self.root.agroup._v_attrs[name] + del self.root.atable.attrs[name] + del self.root.anarray.attrs[name] + self.check_missing(name) + + def test01a_setAttributes(self): + """Checking attribute names validity""" + with warnings.catch_warnings(): + warnings.simplefilter('ignore', tb.NaturalNameWarning) + self.check_name('a') + self.check_name('a:b') + self.check_name('/a/b') + self.check_name('.') + self.assertRaises(ValueError, self.check_name, '') + self.assertRaises(ValueError, self.check_name, '__members__') + self.assertRaises(TypeError, self.check_name, 0) + + def 
test02_setAttributes(self): + """Checking setting large string attributes (Node methods)""" + + attrlength = 2048 + # Try to put a long string attribute on a group object + self.root.agroup._f_setattr('attr1', "p" * attrlength) + # Now, try with a Table object + self.root.atable.set_attr('attr1', "a" * attrlength) + + # Finally, try with an Array object + self.root.anarray.set_attr('attr1', "n" * attrlength) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + self.assertEqual(self.root.agroup._f_getattr( + 'attr1'), "p" * attrlength) + self.assertEqual(self.root.atable.get_attr("attr1"), "a" * attrlength) + self.assertEqual(self.root.anarray.get_attr("attr1"), "n" * attrlength) + + def test03_setAttributes(self): + """Checking setting large string attributes (AttributeSet methods)""" + + attrlength = 2048 + # Try to put a long string attribute on a group object + self.group._v_attrs.attr1 = "p" * attrlength + # Now, try with a Table object + self.table.attrs.attr1 = "a" * attrlength + # Finally, try with an Array object + self.array.attrs.attr1 = "n" * attrlength + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + # This should work even when the node cache is disabled + self.assertEqual(self.root.agroup._v_attrs.attr1, "p" * attrlength) + self.assertEqual(self.root.atable.attrs.attr1, "a" * attrlength) + self.assertEqual(self.root.anarray.attrs.attr1, "n" * attrlength) + + def test04_listAttributes(self): + """Checking listing attributes.""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + if common.verbose: + print("Attribute list:", self.group._v_attrs._f_list()) + + # Now, try with a Table object + self.table.attrs.a = "1" + self.table.attrs.c = "2" + self.table.attrs.b = "3" + if common.verbose: + print("Attribute list:", self.table.attrs._f_list()) + + # Finally, try with an Array object + self.array.attrs.k = "1" + self.array.attrs.j = "2" + self.array.attrs.i = "3" + if common.verbose: + print("Attribute list:", self.array.attrs._f_list()) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup = self.root.agroup + self.assertEqual(agroup._v_attrs._f_list("user"), ["pq", "qr", "rs"]) + self.assertEqual(agroup._v_attrs._f_list("sys"), + ['CLASS', 'TITLE', 'VERSION']) + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "pq", "qr", "rs"]) + + atable = self.root.atable + self.assertEqual(atable.attrs._f_list(), ["a", "b", "c"]) + self.assertEqual(atable.attrs._f_list("sys"), + ['CLASS', + 'FIELD_0_FILL', 'FIELD_0_NAME', + 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', + 'FIELD_3_FILL', 'FIELD_3_NAME', + 'FIELD_4_FILL', 'FIELD_4_NAME', + 'NROWS', + 'TITLE', 'VERSION']) + self.assertEqual(atable.attrs._f_list("all"), + ['CLASS', + 'FIELD_0_FILL', 'FIELD_0_NAME', + 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', + 'FIELD_3_FILL', 'FIELD_3_NAME', + 'FIELD_4_FILL', 'FIELD_4_NAME', + 'NROWS', + 'TITLE', 'VERSION', + "a", "b", "c"]) + + anarray = self.root.anarray + self.assertEqual(anarray.attrs._f_list(), ["i", "j", "k"]) + self.assertEqual( + anarray.attrs._f_list("sys"), + ['CLASS', 
'FLAVOR', 'TITLE', 'VERSION']) + self.assertEqual( + anarray.attrs._f_list("all"), + ['CLASS', 'FLAVOR', 'TITLE', 'VERSION', "i", "j", "k"]) + + def test05_removeAttributes(self): + """Checking removing attributes.""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + # delete an attribute + del self.group._v_attrs.pq + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup = self.root.agroup + if common.verbose: + print("Attribute list:", agroup._v_attrs._f_list()) + # Check the local attributes names + self.assertEqual(agroup._v_attrs._f_list(), ["qr", "rs"]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list("all")) + # Check the disk attribute names + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "qr", "rs"]) + + # delete an attribute (__delattr__ method) + del agroup._v_attrs.qr + if common.verbose: + print("Attribute list:", agroup._v_attrs._f_list()) + # Check the local attributes names + self.assertEqual(agroup._v_attrs._f_list(), ["rs"]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list()) + # Check the disk attribute names + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "rs"]) + + def test05b_removeAttributes(self): + """Checking removing attributes (using File.del_node_attr())""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + # delete an attribute + self.h5file.del_node_attr(self.group, "pq") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup = self.root.agroup + if common.verbose: + print("Attribute list:", agroup._v_attrs._f_list()) + # Check the local attributes names + self.assertEqual(agroup._v_attrs._f_list(), ["qr", "rs"]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list("all")) + # Check the disk attribute names + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "qr", "rs"]) + + # delete an attribute (File.del_node_attr method) + self.h5file.del_node_attr(self.root, "qr", "agroup") + if common.verbose: + print("Attribute list:", agroup._v_attrs._f_list()) + # Check the local attributes names + self.assertEqual(agroup._v_attrs._f_list(), ["rs"]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list()) + # Check the disk attribute names + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "rs"]) + + def test06_removeAttributes(self): + """Checking removing system attributes.""" + + # remove a system attribute + if common.verbose: + print("Before removing CLASS attribute") + print("System attrs:", self.group._v_attrs._v_attrnamessys) + del self.group._v_attrs.CLASS + self.assertEqual(self.group._v_attrs._f_list("sys"), + ['TITLE', 'VERSION']) + if common.verbose: + print("After removing CLASS attribute") + print("System attrs:", self.group._v_attrs._v_attrnamessys) + + def test07_renameAttributes(self): + """Checking renaming attributes.""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + # rename an attribute + self.group._v_attrs._f_rename("pq", "op") + + if 
self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup = self.root.agroup + if common.verbose: + print("Attribute list:", agroup._v_attrs._f_list()) + # Check the local attributes names (alphabetically sorted) + self.assertEqual(agroup._v_attrs._f_list(), ["op", "qr", "rs"]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list("all")) + # Check the disk attribute names (not sorted) + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "op", "qr", "rs"]) + + def test08_renameAttributes(self): + """Checking renaming system attributes.""" + + if common.verbose: + print("Before renaming CLASS attribute") + print("All attrs:", self.group._v_attrs._v_attrnames) + # rename a system attribute + self.group._v_attrs._f_rename("CLASS", "op") + if common.verbose: + print("After renaming CLASS attribute") + print("All attrs:", self.group._v_attrs._v_attrnames) + + # Check the disk attribute names (not sorted) + agroup = self.root.agroup + self.assertEqual(agroup._v_attrs._f_list("all"), + ['TITLE', 'VERSION', "op"]) + + def test09_overwriteAttributes(self): + """Checking overwriting attributes.""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + # overwrite attributes + self.group._v_attrs.pq = "4" + self.group._v_attrs.qr = 2 + self.group._v_attrs.rs = [1, 2, 3] + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup = self.root.agroup + if common.verbose: + print("Value of Attribute pq:", agroup._v_attrs.pq) + # Check the local attributes names (alphabetically sorted) + self.assertEqual(agroup._v_attrs.pq, "4") + self.assertEqual(agroup._v_attrs.qr, 2) + self.assertEqual(agroup._v_attrs.rs, [1, 2, 3]) + if common.verbose: + print("Attribute list in disk:", agroup._v_attrs._f_list("all")) + # Check the disk attribute names (not sorted) + self.assertEqual(agroup._v_attrs._f_list("all"), + ['CLASS', 'TITLE', 'VERSION', "pq", "qr", "rs"]) + + def test10a_copyAttributes(self): + """Checking copying attributes.""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + # copy all attributes from "/agroup" to "/atable" + self.group._v_attrs._f_copy(self.root.atable) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + atable = self.root.atable + if common.verbose: + print("Attribute list:", atable._v_attrs._f_list()) + # Check the local attributes names (alphabetically sorted) + self.assertEqual(atable._v_attrs._f_list(), ["pq", "qr", "rs"]) + if common.verbose: + print("Complete attribute list:", atable._v_attrs._f_list("all")) + # Check the disk attribute names (not sorted) + self.assertEqual(atable._v_attrs._f_list("all"), + ['CLASS', + 'FIELD_0_FILL', 'FIELD_0_NAME', + 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', + 'FIELD_3_FILL', 'FIELD_3_NAME', + 'FIELD_4_FILL', 'FIELD_4_NAME', + 'NROWS', + 'TITLE', 'VERSION', + "pq", "qr", "rs"]) + + def test10b_copyAttributes(self): + """Checking copying attributes (copy_node_attrs)""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + 
self.group._v_attrs.rs = "3" + # copy all attributes from "/agroup" to "/atable" + self.h5file.copy_node_attrs(self.group, self.root.atable) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + atable = self.root.atable + if common.verbose: + print("Attribute list:", atable._v_attrs._f_list()) + # Check the local attributes names (alphabetically sorted) + self.assertEqual(atable._v_attrs._f_list(), ["pq", "qr", "rs"]) + if common.verbose: + print("Complete attribute list:", atable._v_attrs._f_list("all")) + # Check the disk attribute names (not sorted) + self.assertEqual(atable._v_attrs._f_list("all"), + ['CLASS', + 'FIELD_0_FILL', 'FIELD_0_NAME', + 'FIELD_1_FILL', 'FIELD_1_NAME', + 'FIELD_2_FILL', 'FIELD_2_NAME', + 'FIELD_3_FILL', 'FIELD_3_NAME', + 'FIELD_4_FILL', 'FIELD_4_NAME', + 'NROWS', + 'TITLE', 'VERSION', + "pq", "qr", "rs"]) + + def test10c_copyAttributes(self): + """Checking copying attributes during group copies.""" + + # With a Group object + self.group._v_attrs['CLASS'] = "GROUP2" + self.group._v_attrs['VERSION'] = "1.3" + # copy "/agroup" to "/agroup2" + self.h5file.copy_node(self.group, self.root, "agroup2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + agroup2 = self.root.agroup2 + if common.verbose: + print("Complete attribute list:", agroup2._v_attrs._f_list("all")) + self.assertEqual(agroup2._v_attrs['CLASS'], "GROUP2") + self.assertEqual(agroup2._v_attrs['VERSION'], "1.3") + + def test10d_copyAttributes(self): + """Checking copying attributes during leaf copies.""" + + # With a Group object + atable = self.root.atable + atable._v_attrs['CLASS'] = "TABLE2" + atable._v_attrs['VERSION'] = "1.3" + # copy "/agroup" to "/agroup2" + self.h5file.copy_node(atable, self.root, "atable2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', node_cache_slots=self.node_cache_slots) + self.root = self.h5file.root + + atable2 = self.root.atable2 + if common.verbose: + print("Complete attribute list:", atable2._v_attrs._f_list("all")) + self.assertEqual(atable2._v_attrs['CLASS'], "TABLE2") + self.assertEqual(atable2._v_attrs['VERSION'], "1.3") + + def test11a_getitem(self): + """Checking the __getitem__ interface.""" + + attrs = self.group._v_attrs + attrs.pq = "1" + self.assertEqual(attrs['pq'], "1") + + def test11b_setitem(self): + """Checking the __setitem__ interface.""" + + attrs = self.group._v_attrs + attrs['pq'] = "2" + self.assertEqual(attrs['pq'], "2") + + def test11c_delitem(self): + """Checking the __delitem__ interface.""" + + attrs = self.group._v_attrs + attrs.pq = "1" + del attrs['pq'] + self.assertNotIn('pq', attrs._f_list()) + + def test11d_KeyError(self): + """Checking that KeyError is raised in __getitem__/__delitem__.""" + + attrs = self.group._v_attrs + self.assertRaises(KeyError, attrs.__getitem__, 'pq') + self.assertRaises(KeyError, attrs.__delitem__, 'pq') + + def test_2d_non_contiguous(self): + """Checking setting 2D and non-contiguous NumPy attributes""" + + # Regression for gh-176 numpy. + # In the views old implementation PyTAbles performa a copy of the + # array: + # + # value = numpy.array(value) + # + # in order to get a contiguous array. 
+ # Unfortunately array with swapped axis are copyed as they are so + # thay are stored in to HDF5 attributes without being actually + # contiguous and ths causes an error whn they are restored. + + data = np.array([[0, 1], [2, 3]]) + + self.array.attrs['a'] = data + self.array.attrs['b'] = data.T.copy() + self.array.attrs['c'] = data.T + + np.testing.assert_array_equal(self.array.attrs['a'], data) + np.testing.assert_array_equal(self.array.attrs['b'], data.T) + # AssertionError: + np.testing.assert_array_equal(self.array.attrs['c'], data.T) + + def test12_dir(self): + """Checking AttributeSet.__dir__""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test12_dir..." % self.__class__.__name__) + + attrset = self.group._v_attrs + + user_attr = 'good_attr' + sys_attr = 'BETTER_ATTR' + for a in [user_attr, sys_attr]: + attrset[a] = 1 + + bad_user = '5bad' + bad_sys = 'SYS%' + for a in [bad_user, bad_sys]: + with warnings.catch_warnings(): + warnings.simplefilter('ignore', tb.NaturalNameWarning) + attrset[a] = 1 + + completions = dir(attrset) + + # Check some regular attributes. + self.assertIn('__class__', completions) + self.assertIn('_f_copy', completions) + self.assertEqual(completions.count('_f_copy'), 1) + + # Check SYS attrs. + self.assertNotIn(bad_sys, completions) + self.assertIn(sys_attr, completions) + self.assertEqual(completions.count(sys_attr), 1) + + # Check USER attrs. + self.assertIn(user_attr, completions) + self.assertNotIn(bad_user, completions) + self.assertEqual(completions.count(user_attr), 1) + + # Now check all for no duplicates. + self.assertSequenceEqual(sorted(set(completions)), + sorted(completions)) + + +class NotCloseCreate(CreateTestCase): + close = False + node_cache_slots = tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class CloseCreate(CreateTestCase): + close = True + node_cache_slots = tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class NoCacheNotCloseCreate(CreateTestCase): + close = False + node_cache_slots = 0 + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class NoCacheCloseCreate(CreateTestCase): + close = True + node_cache_slots = 0 + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class DictCacheNotCloseCreate(CreateTestCase): + close = False + node_cache_slots = -tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class DictCacheCloseCreate(CreateTestCase): + close = True + node_cache_slots = -tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class TypesTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + self.open_kwargs = {'allow_padding': self.allow_padding} + super().setUp() + self.root = self.h5file.root + + # Create an array object + self.array = self.h5file.create_array(self.root, 'anarray', + [1], "Array title") + # Create a group object + self.group = self.h5file.create_group(self.root, 'agroup', + "Group title") + + def test00a_setBoolAttributes(self): + """Checking setting Bool attributes (scalar, Python case)""" + + self.array.attrs.pq = True + self.array.attrs.qr = False + self.array.attrs.rs = True + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + 
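+            # After _reopen() the previously held node handles belong to the
+            # closed file, so the test re-fetches the array from the freshly
+            # opened file before checking its attributes.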
self.array = self.h5file.root.anarray + + self.assertEqual(self.root.anarray.attrs.pq, True) + self.assertEqual(self.root.anarray.attrs.qr, False) + self.assertEqual(self.root.anarray.attrs.rs, True) + + def test00b_setBoolAttributes(self): + """Checking setting Bool attributes (scalar, NumPy case)""" + + self.array.attrs.pq = np.bool_(True) + self.array.attrs.qr = np.bool_(False) + self.array.attrs.rs = np.bool_(True) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.root.anarray.attrs.pq, np.bool_) + self.assertIsInstance(self.root.anarray.attrs.qr, np.bool_) + self.assertIsInstance(self.root.anarray.attrs.rs, np.bool_) + self.assertEqual(self.root.anarray.attrs.pq, True) + self.assertEqual(self.root.anarray.attrs.qr, False) + self.assertEqual(self.root.anarray.attrs.rs, True) + + def test00c_setBoolAttributes(self): + """Checking setting Bool attributes (NumPy, 0-dim case)""" + + self.array.attrs.pq = np.array(True) + self.array.attrs.qr = np.array(False) + self.array.attrs.rs = np.array(True) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertEqual(self.root.anarray.attrs.pq, True) + self.assertEqual(self.root.anarray.attrs.qr, False) + self.assertEqual(self.root.anarray.attrs.rs, True) + + def test00d_setBoolAttributes(self): + """Checking setting Bool attributes (NumPy, multidim case)""" + + self.array.attrs.pq = np.array([True]) + self.array.attrs.qr = np.array([[False]]) + self.array.attrs.rs = np.array([[True, False], [True, False]]) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array([True])) + np.testing.assert_array_equal(self.root.anarray.attrs.qr, + np.array([[False]])) + np.testing.assert_array_equal(self.root.anarray.attrs.rs, + np.array([[True, False], [True, False]])) + + def test01a_setIntAttributes(self): + """Checking setting Int attributes (scalar, Python case)""" + + self.array.attrs.pq = 1 + self.array.attrs.qr = 2 + self.array.attrs.rs = 3 + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.root.anarray.attrs.pq, np.int_) + self.assertIsInstance(self.root.anarray.attrs.qr, np.int_) + self.assertIsInstance(self.root.anarray.attrs.rs, np.int_) + self.assertEqual(self.root.anarray.attrs.pq, 1) + self.assertEqual(self.root.anarray.attrs.qr, 2) + self.assertEqual(self.root.anarray.attrs.rs, 3) + + def 
test01b_setIntAttributes(self): + """Checking setting Int attributes (scalar, NumPy case)""" + + # 'UInt64' not supported on Win + checktypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array(1, dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + np.array(1, dtype=dtype)) + + def test01c_setIntAttributes(self): + """Checking setting Int attributes (unidimensional NumPy case)""" + + # 'UInt64' not supported on Win + checktypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array([1, 2], dtype=dtype)) + + # Check the results + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + if common.verbose: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + np.array([1, 2], dtype=dtype)) + + def test01d_setIntAttributes(self): + """Checking setting Int attributes (unidimensional, non-contiguous)""" + + # 'UInt64' not supported on Win + checktypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32'] + + for dtype in checktypes: + arr = np.array([1, 2, 3, 4], dtype=dtype)[::2] + setattr(self.array.attrs, dtype, arr) + + # Check the results + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + arr = np.array([1, 2, 3, 4], dtype=dtype)[::2] + if common.verbose: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + arr) + + def test01e_setIntAttributes(self): + """Checking setting Int attributes (bidimensional NumPy case)""" + + # 'UInt64' not supported on Win + checktypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, + np.array([[1, 2], [2, 3]], dtype=dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + # Check the results + for dtype in checktypes: + if common.verbose: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + np.testing.assert_array_equal( + getattr(self.array.attrs, dtype), + np.array([[1, 2], [2, 3]], dtype=dtype)) + + def test02a_setFloatAttributes(self): + """Checking setting Float (double) attributes.""" + + # Set some attrs + self.array.attrs.pq = 1.0 + self.array.attrs.qr = 2.0 + self.array.attrs.rs = 3.0 + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + 
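+        # Plain Python floats are expected to come back as NumPy float64
+        # scalars (np.float_) after the round trip through HDF5 attributes,
+        # which is what the isinstance checks below verify.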
self.assertIsInstance(self.root.anarray.attrs.pq, np.float_) + self.assertIsInstance(self.root.anarray.attrs.qr, np.float_) + self.assertIsInstance(self.root.anarray.attrs.rs, np.float_) + self.assertEqual(self.root.anarray.attrs.pq, 1.0) + self.assertEqual(self.root.anarray.attrs.qr, 2.0) + self.assertEqual(self.root.anarray.attrs.rs, 3.0) + + def test02b_setFloatAttributes(self): + """Checking setting Float attributes (scalar, NumPy case)""" + + checktypes = ['float32', 'float64'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array(1.1, dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + # assert getattr(self.array.attrs, dtype) == 1.1 + # In order to make Float32 tests pass. This is legal, not a trick. + np.testing.assert_almost_equal(getattr(self.array.attrs, dtype), + 1.1) + + def test02c_setFloatAttributes(self): + """Checking setting Float attributes (unidimensional NumPy case)""" + + checktypes = ['float32', 'float64'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array([1.1, 2.1], dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + np.array([1.1, 2.1], dtype=dtype)) + + def test02d_setFloatAttributes(self): + """Checking setting Float attributes (unidimensional, + non-contiguous)""" + + checktypes = ['float32', 'float64'] + + for dtype in checktypes: + arr = np.array([1.1, 2.1, 3.1, 4.1], dtype=dtype)[1::2] + setattr(self.array.attrs, dtype, arr) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + arr = np.array([1.1, 2.1, 3.1, 4.1], dtype=dtype)[1::2] + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + arr) + + def test02e_setFloatAttributes(self): + """Checking setting Int attributes (bidimensional NumPy case)""" + + checktypes = ['float32', 'float64'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, + np.array([[1.1, 2.1], [2.1, 3.1]], dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + np.testing.assert_array_equal( + getattr(self.array.attrs, dtype), + np.array([[1.1, 2.1], [2.1, 3.1]], dtype=dtype)) + + def test03_setObjectAttributes(self): + """Checking setting Object attributes.""" + + # Set some attrs + self.array.attrs.pq = [1.0, 2] + self.array.attrs.qr = (1, 2) + self.array.attrs.rs = {"ddf": 32.1, "dsd": 1} + + # Check the results + if 
common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertEqual(self.root.anarray.attrs.pq, [1.0, 2]) + self.assertEqual(self.root.anarray.attrs.qr, (1, 2)) + self.assertEqual(self.root.anarray.attrs.rs, {"ddf": 32.1, "dsd": 1}) + + def test04a_setStringAttributes(self): + """Checking setting string attributes (scalar case)""" + + self.array.attrs.pq = 'foo' + self.array.attrs.qr = 'bar' + self.array.attrs.rs = 'baz' + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.root.anarray.attrs.pq, np.str_) + self.assertIsInstance(self.root.anarray.attrs.qr, np.str_) + self.assertIsInstance(self.root.anarray.attrs.rs, np.str_) + self.assertEqual(self.root.anarray.attrs.pq, 'foo') + self.assertEqual(self.root.anarray.attrs.qr, 'bar') + self.assertEqual(self.root.anarray.attrs.rs, 'baz') + + def test04b_setStringAttributes(self): + """Checking setting string attributes (unidimensional 1-elem case)""" + + self.array.attrs.pq = np.array(['foo']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array(['foo'])) + + def test04c_setStringAttributes(self): + """Checking setting string attributes (empty unidimensional + 1-elem case)""" + + self.array.attrs.pq = np.array(['']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + if common.verbose: + print("pq -->", self.array.attrs.pq) + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array([''])) + + def test04d_setStringAttributes(self): + """Checking setting string attributes (unidimensional 2-elem case)""" + + self.array.attrs.pq = np.array(['foo', 'bar3']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array(['foo', 'bar3'])) + + def test04e_setStringAttributes(self): + """Checking setting string attributes (empty unidimensional + 2-elem case)""" + + self.array.attrs.pq = np.array(['', '']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array(['', ''])) + + def test04f_setStringAttributes(self): + """Checking setting string attributes (bidimensional 4-elem case)""" + 
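+        # A 2x2 NumPy array of fixed-width strings; the assertion at the end
+        # checks that both the values and the shape survive the round trip.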
+ self.array.attrs.pq = np.array([['foo', 'foo2'], ['foo3', 'foo4']]) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.root.anarray.attrs.pq, + np.array([['foo', 'foo2'], + ['foo3', 'foo4']])) + + def test05a_setComplexAttributes(self): + """Checking setting Complex (python) attributes.""" + + # Set some attrs + self.array.attrs.pq = 1.0 + 2j + self.array.attrs.qr = 2.0 + 3j + self.array.attrs.rs = 3.0 + 4j + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.root.anarray.attrs.pq, np.complex_) + self.assertIsInstance(self.root.anarray.attrs.qr, np.complex_) + self.assertIsInstance(self.root.anarray.attrs.rs, np.complex_) + self.assertEqual(self.root.anarray.attrs.pq, 1.0 + 2j) + self.assertEqual(self.root.anarray.attrs.qr, 2.0 + 3j) + self.assertEqual(self.root.anarray.attrs.rs, 3.0 + 4j) + + def test05b_setComplexAttributes(self): + """Checking setting Complex attributes (scalar, NumPy case)""" + + checktypes = ['complex64', 'complex128'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array(1.1 + 2j, dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + # assert getattr(self.array.attrs, dtype) == 1.1 + 2j + # In order to make Complex32 tests pass. 
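+            # complex64 stores each component as float32, so the literal
+            # 1.1 + 2j cannot be reproduced exactly; assert_almost_equal
+            # tolerates that loss of precision.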
+ np.testing.assert_almost_equal(getattr(self.array.attrs, dtype), + 1.1 + 2j) + + def test05c_setComplexAttributes(self): + """Checking setting Complex attributes (unidimensional NumPy case)""" + + checktypes = ['complex64', 'complex128'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, np.array([1.1, 2.1], dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", dtype, + getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + np.testing.assert_array_equal(getattr(self.array.attrs, dtype), + np.array([1.1, 2.1], dtype=dtype)) + + def test05d_setComplexAttributes(self): + """Checking setting Int attributes (bidimensional NumPy case)""" + + checktypes = ['complex64', 'complex128'] + + for dtype in checktypes: + setattr(self.array.attrs, dtype, + np.array([[1.1, 2.1], [2.1, 3.1]], dtype=dtype)) + + # Check the results + if common.verbose: + for dtype in checktypes: + print("type, value-->", + dtype, getattr(self.array.attrs, dtype)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + for dtype in checktypes: + np.testing.assert_array_equal( + getattr(self.array.attrs, dtype), + np.array([[1.1, 2.1], [2.1, 3.1]], dtype=dtype)) + + def test06a_setUnicodeAttributes(self): + """Checking setting unicode attributes (scalar case)""" + + self.array.attrs.pq = 'para\u0140lel' + self.array.attrs.qr = '' # check #213 or gh-64 + self.array.attrs.rs = 'baz' + + # Check the results + if common.verbose: + if sys.platform != 'win32': + # It seems that Windows cannot print this + print("pq -->", repr(self.array.attrs.pq)) + # XXX: try to use repr instead + # print("pq -->", repr(self.array.attrs.pq)) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.array.attrs.pq, np.unicode_) + self.assertIsInstance(self.array.attrs.qr, np.unicode_) + self.assertIsInstance(self.array.attrs.rs, np.unicode_) + self.assertEqual(self.array.attrs.pq, 'para\u0140lel') + self.assertEqual(self.array.attrs.qr, '') + self.assertEqual(self.array.attrs.rs, 'baz') + + def test06b_setUnicodeAttributes(self): + """Checking setting unicode attributes (unidimensional 1-elem case)""" + + self.array.attrs.pq = np.array(['para\u0140lel']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.array.attrs.pq, + np.array(['para\u0140lel'])) + + def test06c_setUnicodeAttributes(self): + """Checking setting unicode attributes (empty unidimensional + 1-elem case)""" + + # The next raises a `TypeError` when unpickled. 
See: + # http://projects.scipy.org/numpy/ticket/1037 + # self.array.attrs.pq = numpy.array(['']) + self.array.attrs.pq = np.array([''], dtype="U1") + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + if common.verbose: + print("pq -->", repr(self.array.attrs.pq)) + + np.testing.assert_array_equal(self.array.attrs.pq, + np.array([''], dtype="U1")) + + def test06d_setUnicodeAttributes(self): + """Checking setting unicode attributes (unidimensional 2-elem case)""" + + self.array.attrs.pq = np.array(['para\u0140lel', 'bar3']) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.array.attrs.pq, + np.array(['para\u0140lel', 'bar3'])) + + def test06e_setUnicodeAttributes(self): + """Checking setting unicode attributes (empty unidimensional + 2-elem case)""" + + self.array.attrs.pq = np.array(['', ''], dtype="U1") + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.array.attrs.pq, + np.array(['', ''], dtype="U1")) + + def test06f_setUnicodeAttributes(self): + """Checking setting unicode attributes (bidimensional 4-elem case)""" + + self.array.attrs.pq = np.array([['para\u0140lel', 'foo2'], + ['foo3', 'para\u0140lel4']]) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + np.testing.assert_array_equal(self.array.attrs.pq, + np.array([['para\u0140lel', 'foo2'], + ['foo3', 'para\u0140lel4']])) + + def test07a_setRecArrayAttributes(self): + """Checking setting RecArray (NumPy) attributes.""" + + dt = np.dtype('i4,f8', align=self.aligned) + # Set some attrs + self.array.attrs.pq = np.zeros(2, dt) + self.array.attrs.qr = np.ones((2, 2), dt) + self.array.attrs.rs = np.array([(1, 2.)], dt) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.array.attrs.pq, np.ndarray) + self.assertIsInstance(self.array.attrs.qr, np.ndarray) + self.assertIsInstance(self.array.attrs.rs, np.ndarray) + np.testing.assert_array_equal(self.array.attrs.pq, np.zeros(2, dt)) + np.testing.assert_array_equal(self.array.attrs.qr, + np.ones((2, 2), dt)) + np.testing.assert_array_equal(self.array.attrs.rs, + np.array([(1, 2.)], dt)) + + def test07b_setRecArrayAttributes(self): + """Checking setting nested RecArray (NumPy) attributes.""" + + # Build a nested dtype + dt = np.dtype([('f1', [('f1', 'i2'), ('f2', 'f8')])]) + # Set some attrs + self.array.attrs.pq = np.zeros(2, dt) + self.array.attrs.qr = np.ones((2, 2), dt) + self.array.attrs.rs = 
np.array([((1, 2.),)], dt) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.array.attrs.pq, np.ndarray) + self.assertIsInstance(self.array.attrs.qr, np.ndarray) + self.assertIsInstance(self.array.attrs.rs, np.ndarray) + np.testing.assert_array_equal(self.array.attrs.pq, + np.zeros(2, dt)) + np.testing.assert_array_equal(self.array.attrs.qr, + np.ones((2, 2), dt)) + np.testing.assert_array_equal(self.array.attrs.rs, + np.array([((1, 2),)], dt)) + + def test07c_setRecArrayAttributes(self): + """Checking setting multidim nested RecArray (NumPy) attributes.""" + + # Build a nested dtype + dt = np.dtype([('f1', [('f1', 'i2', (2,)), ('f2', 'f8')])], align=True) + + # Set some attrs + self.array.attrs.pq = np.zeros(2, dt) + self.array.attrs.qr = np.ones((2, 2), dt) + self.array.attrs.rs = np.array([(([1, 3], 2.),)], dt) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.array.attrs.pq, np.ndarray) + self.assertIsInstance(self.array.attrs.qr, np.ndarray) + self.assertIsInstance(self.array.attrs.rs, np.ndarray) + np.testing.assert_array_equal(self.array.attrs.pq, np.zeros(2, dt)) + np.testing.assert_array_equal(self.array.attrs.qr, np.ones((2, 2), dt)) + np.testing.assert_array_equal(self.array.attrs.rs, + np.array([(([1, 3], 2),)], dt)) + + def test08_setRecArrayNotAllowPadding(self): + """Checking setting aligned RecArray (NumPy) attributes with + `allow_aligned` param set to False when reopen.""" + + dt = np.dtype('i4,f8', align=self.aligned) + # Set some attrs + self.array.attrs.pq = np.zeros(2, dt) + self.array.attrs.qr = np.ones((2, 2), dt) + self.array.attrs.rs = np.array([(1, 2.)], dt) + + # Check the results + if common.verbose: + print("pq -->", self.array.attrs.pq) + print("qr -->", self.array.attrs.qr) + print("rs -->", self.array.attrs.rs) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+', allow_align=False) + self.root = self.h5file.root + self.array = self.h5file.root.anarray + + self.assertIsInstance(self.array.attrs.pq, np.ndarray) + self.assertIsInstance(self.array.attrs.qr, np.ndarray) + self.assertIsInstance(self.array.attrs.rs, np.ndarray) + np.testing.assert_array_equal(self.array.attrs.pq, np.zeros(2, dt)) + np.testing.assert_array_equal(self.array.attrs.qr, np.ones((2, 2), dt)) + np.testing.assert_array_equal(self.array.attrs.rs, + np.array([(1, 2.)], dt)) + + +class NotCloseTypesTestCase(TypesTestCase): + allow_padding = False + aligned = False + close = False + + +class NoCloseAlignedTypesTestCase(TypesTestCase): + allow_padding = True + aligned = True + close = False + + +class CloseNotAlignedPaddedTypesTestCase(TypesTestCase): + allow_padding = False + aligned = False + close = True + + +class CloseTypesTestCase(TypesTestCase): + allow_padding = True + aligned = False + close = True + + +class CloseAlignedTypesTestCase(TypesTestCase): + allow_padding = False + aligned = True + close = True + + +class 
CloseAlignedPaddedTypesTestCase(TypesTestCase): + allow_padding = True + aligned = True + close = True + + +class NoSysAttrsTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_kwargs = dict(pytables_sys_attrs=False) + + def setUp(self): + super().setUp() + self.root = self.h5file.root + + # Create a table object + self.table = self.h5file.create_table(self.root, 'atable', + Record, "Table title") + # Create an array object + self.array = self.h5file.create_array(self.root, 'anarray', + [1], "Array title") + # Create a group object + self.group = self.h5file.create_group(self.root, 'agroup', + "Group title") + + def test00_listAttributes(self): + """Checking listing attributes (no system attrs version).""" + + # With a Group object + self.group._v_attrs.pq = "1" + self.group._v_attrs.qr = "2" + self.group._v_attrs.rs = "3" + if common.verbose: + print("Attribute list:", self.group._v_attrs._f_list()) + + # Now, try with a Table object + self.table.attrs.a = "1" + self.table.attrs.c = "2" + self.table.attrs.b = "3" + if common.verbose: + print("Attribute list:", self.table.attrs._f_list()) + + # Finally, try with an Array object + self.array.attrs.k = "1" + self.array.attrs.j = "2" + self.array.attrs.i = "3" + if common.verbose: + print("Attribute list:", self.array.attrs._f_list()) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='r+') + self.root = self.h5file.root + + agroup = self.root.agroup + self.assertEqual(agroup._v_attrs._f_list("user"), ["pq", "qr", "rs"]) + self.assertEqual(agroup._v_attrs._f_list("sys"), []) + self.assertEqual(agroup._v_attrs._f_list("all"), ["pq", "qr", "rs"]) + + atable = self.root.atable + self.assertEqual(atable.attrs._f_list(), ["a", "b", "c"]) + self.assertEqual(atable.attrs._f_list("sys"), []) + self.assertEqual(atable.attrs._f_list("all"), ["a", "b", "c"]) + + anarray = self.root.anarray + self.assertEqual(anarray.attrs._f_list(), ["i", "j", "k"]) + self.assertEqual(anarray.attrs._f_list("sys"), []) + self.assertEqual(anarray.attrs._f_list("all"), ["i", "j", "k"]) + + +class NoSysAttrsNotClose(NoSysAttrsTestCase): + close = False + + +class NoSysAttrsClose(NoSysAttrsTestCase): + close = True + + +class CompatibilityTestCase(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename('issue_368.h5') + + @common.unittest.skipIf(Version(np.__version__) < Version('1.9.0'), + 'requires numpy >= 1.9') + def test_pickled_unicode_attrs(self): + # See also gh-368 and https://github.com/numpy/numpy/issues/4879. + # + # This is a compatibility test. In PyTables < 3.0 unicode + # attributes were stored as pickld unicode stings. + # In PyTables >= 3.0 unicode strings are stored as encoded utf-8 + # strings (the utf-8 marker is set at HDF5 level). + # + # In any case PyTables (>= 3.0) should be able to handle correctly + # also data files genetated with older versions of PyTables. + # Unfortunately a bug in numpy < 1.9 + # (https://github.com/numpy/numpy/issues/4879) makes it impossible + # unpickle numpy arrays with dtype "U" resulting in an incorrect + # behaviour of PyTables. + + self.assertEqual( + self.h5file.get_node_attr('/', 'py2_pickled_unicode'), 'abc') + + +class PicklePy2UnpicklePy3TestCase(common.TestFileMixin, + common.PyTablesTestCase): + h5fname = common.test_filename('issue_560.h5') + + def test_pickled_datetime_object(self): + # See also gh-560 + # + # Objects (classes) that are pickled using python 2 may contain + # non-ascii characters in the pickled string. 
This will cause + # a UnicodeDecodeError when unpickling on python 3. + # Python 3.4 adds encoding='bytes' to fix this + # http://bugs.python.org/issue6784 + # Objects pickled in the testfile have non-ascii chars in the + # picklestring and will throw UnicodeDecodeError when unpickled + # on python 3. + + # datetime will be unpickled with encoding='bytes' + self.assertIsInstance( + self.h5file.get_node_attr('/', 'py2_pickled_datetime'), + datetime.datetime) + # dict will be unpickled with encoding='latin1' + d = self.h5file.get_node_attr('/', 'py2_pickled_dict') + self.assertIsInstance(d, dict) + self.assertEqual(d['s'], 'just a string') + + +class SegFaultPythonTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_segfault(self): + """Checking workaround for Python unpickle problem (see #253).""" + + self.h5file.root._v_attrs.trouble1 = "0" + self.assertEqual(self.h5file.root._v_attrs.trouble1, "0") + self.h5file.root._v_attrs.trouble2 = "0." + self.assertEqual(self.h5file.root._v_attrs.trouble2, "0.") + # Problem happens after reopening + self._reopen() + self.assertEqual(self.h5file.root._v_attrs.trouble1, "0") + self.assertEqual(self.h5file.root._v_attrs.trouble2, "0.") + if common.verbose: + print("Great! '0' and '0.' values can be safely retrieved.") + + +class EmbeddedNullsTestCase(common.TempFileMixin, common.PyTablesTestCase): + # See laso gh-371 (https://github.com/PyTables/PyTables/issues/371) + + def test_unicode(self): + value = "string with a null byte \x00 in it" + + self.h5file.root._v_attrs.name = value + self.assertEqual(self.h5file.root._v_attrs.name, value) + + self._reopen() + + self.assertEqual(self.h5file.root._v_attrs.name, value) + + def test_bytes(self): + value = b"string with a null byte \x00 in it" + + self.h5file.root._v_attrs.name = value + self.assertEqual(self.h5file.root._v_attrs.name, value) + + self._reopen() + + self.assertEqual(self.h5file.root._v_attrs.name, value) + + +class VlenStrAttrTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5fname = common.test_filename('vlstr_attr.h5') + self.h5file = tb.open_file(self.h5fname) + + def tearDown(self): + self.h5file.close() + super().tearDown() + + def test01_vlen_str_scalar(self): + """Checking file with variable length string attributes.""" + + attr = "vlen_str_scalar" + self.assertEqual( + self.h5file.get_node_attr("/", attr), attr.encode('ascii')) + + def test02_vlen_str_array(self): + """Checking file with variable length string attributes (1d).""" + + attr = "vlen_str_array" + v = self.h5file.get_node_attr('/', attr) + self.assertEqual(v.ndim, 1) + for idx, item in enumerate(v): + value = "%s_%d" % (attr, idx) + self.assertEqual(item, value.encode('ascii')) + + def test03_vlen_str_matrix(self): + """Checking file with variable length string attributes (2d).""" + + attr = "vlen_str_matrix" + m = self.h5file.get_node_attr('/', attr) + self.assertEqual(m.ndim, 2) + for row, rowdata in enumerate(m): + for col, item in enumerate(rowdata): + value = "%s_%d%d" % (attr, row, col) + self.assertEqual(item, value.encode('ascii')) + + +class UnsupportedAttrTypeTestCase(common.TestFileMixin, + common.PyTablesTestCase): + h5fname = common.test_filename('attr-u16.h5') + + def test00_unsupportedType(self): + """Checking file with unsupported type.""" + + self.assertWarns(tb.exceptions.DataTypeWarning, repr, self.h5file) + + +# Test for specific system attributes +class SpecificAttrsTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_earray(self): + 
"""Testing EArray specific attrs (create).""" + + ea = self.h5file.create_earray('/', 'ea', tb.Int32Atom(), (2, 0, 4)) + if common.verbose: + print("EXTDIM-->", ea.attrs.EXTDIM) + self.assertEqual(ea.attrs.EXTDIM, 1) + + def test01_earray(self): + """Testing EArray specific attrs (open).""" + + ea = self.h5file.create_earray('/', 'ea', tb.Int32Atom(), (0, 1, 4)) + self._reopen('r') + ea = self.h5file.root.ea + if common.verbose: + print("EXTDIM-->", ea.attrs.EXTDIM) + self.assertEqual(ea.attrs.EXTDIM, 0) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(NotCloseCreate)) + theSuite.addTest(common.unittest.makeSuite(CloseCreate)) + theSuite.addTest(common.unittest.makeSuite(NoCacheNotCloseCreate)) + theSuite.addTest(common.unittest.makeSuite(NoCacheCloseCreate)) + theSuite.addTest(common.unittest.makeSuite(DictCacheNotCloseCreate)) + theSuite.addTest(common.unittest.makeSuite(DictCacheCloseCreate)) + theSuite.addTest(common.unittest.makeSuite(NotCloseTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CloseNotAlignedPaddedTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + NoCloseAlignedTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseAlignedTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CloseAlignedPaddedTypesTestCase)) + theSuite.addTest(common.unittest.makeSuite(NoSysAttrsNotClose)) + theSuite.addTest(common.unittest.makeSuite(NoSysAttrsClose)) + theSuite.addTest(common.unittest.makeSuite(CompatibilityTestCase)) + theSuite.addTest(common.unittest.makeSuite( + PicklePy2UnpicklePy3TestCase)) + theSuite.addTest(common.unittest.makeSuite(SegFaultPythonTestCase)) + theSuite.addTest(common.unittest.makeSuite(EmbeddedNullsTestCase)) + theSuite.addTest(common.unittest.makeSuite(VlenStrAttrTestCase)) + theSuite.addTest(common.unittest.makeSuite( + UnsupportedAttrTypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(SpecificAttrsTestCase)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_aux.py b/tables/tests/test_aux.py new file mode 100644 index 0000000..95f34ee --- /dev/null +++ b/tables/tests/test_aux.py @@ -0,0 +1,33 @@ +import unittest +import numpy as np + +import tables as tb + + +class TestAuxiliaryFunctions(unittest.TestCase): + def test_keysort(self): + N = 1000 + rnd = np.random.randint(N, size=N) + for dtype1 in ('S6', 'b1', 'i1', 'i8', 'u4', 'u8', 'f4', 'f8'): + for dtype2 in ('u4', 'i8'): + a = np.array(rnd, dtype1) + b = np.array(rnd, dtype2) + + c = a.copy() + d = c.argsort() + e = c[d] + f = b[d] + + tb.indexesextension.keysort(a, b) + self.assertTrue((a == e).all()) + self.assertTrue((b == f).all()) + + +def suite(): + theSuite = unittest.TestSuite() + theSuite.addTest(unittest.makeSuite(TestAuxiliaryFunctions)) + return theSuite + + +if __name__ == '__main__': + unittest.main(defaultTest='suite') diff --git a/tables/tests/test_backcompat.py b/tables/tests/test_backcompat.py new file mode 100644 index 0000000..ce80444 --- /dev/null +++ b/tables/tests/test_backcompat.py @@ -0,0 +1,223 @@ +import shutil +import tempfile +import warnings +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +# Check read Tables from pytables version 0.8 +class 
BackCompatTablesTestCase(common.PyTablesTestCase): + def test01_readTable(self): + """Checking backward compatibility of old formats of tables.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + h5file = tb.open_file(common.test_filename(self.h5fname), "r") + + try: + table = h5file.get_node("/tuple0") + + # Read the 100 records + result = [rec['var2'] for rec in table] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", rec) + print("Total selected records in table ==> ", len(result)) + + self.assertEqual(len(result), 100) + finally: + h5file.close() + + +@common.unittest.skipIf(not common.lzo_avail, 'lzo not available') +class Table2_1LZO(BackCompatTablesTestCase): + # pytables 0.8.x versions and after + h5fname = "Table2_1_lzo_nrv2e_shuffle.h5" + + +@common.unittest.skipIf(not common.lzo_avail, 'lzo not available') +class Tables_LZO1(BackCompatTablesTestCase): + h5fname = "Tables_lzo1.h5" # files compressed with LZO1 + + +@common.unittest.skipIf(not common.lzo_avail, 'lzo not available') +class Tables_LZO1_shuffle(BackCompatTablesTestCase): + # files compressed with LZO1 and shuffle + h5fname = "Tables_lzo1_shuffle.h5" + + +@common.unittest.skipIf(not common.lzo_avail, 'lzo not available') +class Tables_LZO2(BackCompatTablesTestCase): + h5fname = "Tables_lzo2.h5" # files compressed with LZO2 + + +@common.unittest.skipIf(not common.lzo_avail, 'lzo not available') +class Tables_LZO2_shuffle(BackCompatTablesTestCase): + # files compressed with LZO2 and shuffle + h5fname = "Tables_lzo2_shuffle.h5" + + +# Check read attributes from PyTables >= 1.0 properly +class BackCompatAttrsTestCase(common.TestFileMixin, common.PyTablesTestCase): + FILENAME = "zerodim-attrs-%s.h5" + + def setUp(self): + self.h5fname = common.test_filename(self.FILENAME % self.format) + super().setUp() + + def test01_readAttr(self): + """Checking backward compatibility of old formats for attributes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readAttr..." % self.__class__.__name__) + + # Read old formats + a = self.h5file.get_node("/a") + scalar = np.array(1, dtype="int32") + vector = np.array([1], dtype="int32") + if self.format == "1.3": + self.assertTrue(common.allequal(a.attrs.arrdim1, vector)) + self.assertTrue(common.allequal(a.attrs.arrscalar, scalar)) + self.assertEqual(a.attrs.pythonscalar, 1) + elif self.format == "1.4": + self.assertTrue(common.allequal(a.attrs.arrdim1, vector)) + self.assertTrue(common.allequal(a.attrs.arrscalar, scalar)) + self.assertTrue(common.allequal(a.attrs.pythonscalar, scalar)) + + +class Attrs_1_3(BackCompatAttrsTestCase): + format = "1.3" # pytables 1.0.x versions and earlier + + +class Attrs_1_4(BackCompatAttrsTestCase): + format = "1.4" # pytables 1.1.x versions and later + + +class VLArrayTestCase(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename("flavored_vlarrays-format1.6.h5") + + def test01_backCompat(self): + """Checking backward compatibility with old flavors of VLArray.""" + + # Check that we can read the contents without problems (nor warnings!) 
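+        # The fixture file stores one VLArray saved with the legacy
+        # 'numeric' flavor and one with the 'python' flavor; both are
+        # expected to load cleanly, as the assertions below verify.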
+        vlarray1 = self.h5file.root.vlarray1
+        self.assertEqual(vlarray1.flavor, "numeric")
+        vlarray2 = self.h5file.root.vlarray2
+        self.assertEqual(vlarray2.flavor, "python")
+        self.assertEqual(vlarray2[1], [b'5', b'6', b'77'])
+
+
+# Make sure that 1.x files with TimeXX types continue to be readable
+# and that their byteorder is correctly retrieved.
+class TimeTestCase(common.TestFileMixin, common.PyTablesTestCase):
+    # Open a PYTABLES_FORMAT_VERSION=1.x file
+    h5fname = common.test_filename("time-table-vlarray-1_x.h5")
+
+    def test00_table(self):
+        """Checking backward compatibility with old TimeXX types (tables)."""
+
+        # Check that we can read the contents without problems (nor warnings!)
+        table = self.h5file.root.table
+        self.assertEqual(table.byteorder, "little")
+
+    def test01_vlarray(self):
+        """Checking backward compatibility with old TimeXX types (vlarrays)."""
+
+        # Check that we can read the contents without problems (nor warnings!)
+        vlarray4 = self.h5file.root.vlarray4
+        self.assertEqual(vlarray4.byteorder, "little")
+        vlarray8 = self.h5file.root.vlarray8
+        self.assertEqual(vlarray8.byteorder, "little")
+
+
+class OldFlavorsTestCase01(common.PyTablesTestCase):
+    close = False
+
+    # numeric
+    def test01_open(self):
+        """Checking opening of (X)Array (old 'numeric' flavor)"""
+
+        # Open the HDF5 with old numeric flavor
+        h5fname = common.test_filename("oldflavor_numeric.h5")
+        with tb.open_file(h5fname) as h5file:
+
+            # Assert other properties in array
+            self.assertEqual(h5file.root.array1.flavor, 'numeric')
+            self.assertEqual(h5file.root.array2.flavor, 'python')
+            self.assertEqual(h5file.root.carray1.flavor, 'numeric')
+            self.assertEqual(h5file.root.carray2.flavor, 'python')
+            self.assertEqual(h5file.root.vlarray1.flavor, 'numeric')
+            self.assertEqual(h5file.root.vlarray2.flavor, 'python')
+
+    def test02_copy(self):
+        """Checking (X)Array.copy() method ('numeric' flavor)"""
+
+        srcfile = common.test_filename("oldflavor_numeric.h5")
+        tmpfile = tempfile.mktemp(".h5")
+        shutil.copy(srcfile, tmpfile)
+        try:
+            # Open the HDF5 with old numeric flavor
+            with tb.open_file(tmpfile, "r+") as h5file:
+                # Copy to another location
+                self.assertWarns(tb.exceptions.FlavorWarning,
+                                 h5file.root.array1.copy, '/', 'array1copy')
+                h5file.root.array2.copy('/', 'array2copy')
+                h5file.root.carray1.copy('/', 'carray1copy')
+                h5file.root.carray2.copy('/', 'carray2copy')
+                h5file.root.vlarray1.copy('/', 'vlarray1copy')
+                h5file.root.vlarray2.copy('/', 'vlarray2copy')
+
+                if self.close:
+                    h5file.close()
+                    h5file = tb.open_file(tmpfile)
+                else:
+                    h5file.flush()
+
+                # Assert other properties in array
+                self.assertEqual(h5file.root.array1copy.flavor, 'numeric')
+                self.assertEqual(h5file.root.array2copy.flavor, 'python')
+                self.assertEqual(h5file.root.carray1copy.flavor, 'numeric')
+                self.assertEqual(h5file.root.carray2copy.flavor, 'python')
+                self.assertEqual(h5file.root.vlarray1copy.flavor, 'numeric')
+                self.assertEqual(h5file.root.vlarray2copy.flavor, 'python')
+        finally:
+            Path(tmpfile).unlink()
+
+
+class OldFlavorsTestCase02(common.PyTablesTestCase):
+    close = True
+
+
+def suite():
+    theSuite = common.unittest.TestSuite()
+    niter = 1
+
+    for n in range(niter):
+        theSuite.addTest(common.unittest.makeSuite(VLArrayTestCase))
+        theSuite.addTest(common.unittest.makeSuite(TimeTestCase))
+        theSuite.addTest(common.unittest.makeSuite(OldFlavorsTestCase01))
+        theSuite.addTest(common.unittest.makeSuite(OldFlavorsTestCase02))
+        theSuite.addTest(common.unittest.makeSuite(Table2_1LZO))
+
theSuite.addTest(common.unittest.makeSuite(Tables_LZO1)) + theSuite.addTest(common.unittest.makeSuite(Tables_LZO1_shuffle)) + theSuite.addTest(common.unittest.makeSuite(Tables_LZO2)) + theSuite.addTest(common.unittest.makeSuite(Tables_LZO2_shuffle)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_basics.py b/tables/tests/test_basics.py new file mode 100644 index 0000000..d72d880 --- /dev/null +++ b/tables/tests/test_basics.py @@ -0,0 +1,2507 @@ +import os +import sys +import shutil +import platform +import tempfile +import warnings +import threading +import subprocess +import queue +from pathlib import Path + +import tables + +try: + import multiprocessing as mp + multiprocessing_imported = True +except ImportError: + multiprocessing_imported = False + +import numpy as np + +import tables as tb +from tables.tests import common + + +class OpenFileFailureTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + + self.N = len(tb.file._open_files) + self.open_files = tb.file._open_files + + def test01_open_file(self): + """Checking opening of a non existing file.""" + + h5fname = tempfile.mktemp(".h5") + with self.assertRaises(IOError): + h5file = tb.open_file(h5fname) + h5file.close() + + self.assertEqual(self.N, len(self.open_files)) + + def test02_open_file(self): + """Checking opening of an existing non HDF5 file.""" + + # create a dummy file + h5fname = tempfile.mktemp(".h5") + Path(h5fname).write_text('') + + # Try to open the dummy file + try: + with self.assertRaises(tb.HDF5ExtError): + h5file = tb.open_file(h5fname) + h5file.close() + + self.assertEqual(self.N, len(self.open_files)) + finally: + Path(h5fname).unlink() + + def test03_open_file(self): + """Checking opening of an existing file with invalid mode.""" + + # See gh-318 + + # create a dummy file + h5fname = tempfile.mktemp(".h5") + h5file = tb.open_file(h5fname, "w") + h5file.close() + + try: + # Try to open the dummy file + self.assertRaises(ValueError, tb.open_file, h5fname, "ab") + finally: + Path(h5fname).unlink() + + +class OpenFileTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.populateFile() + + def populateFile(self): + root = self.h5file.root + + # Create an array + self.h5file.create_array(root, 'array', [1, 2], title="Array example") + self.h5file.create_table(root, 'table', {'var1': tb.IntCol()}, + "Table example") + root._v_attrs.testattr = 41 + + # Create another array object + self.h5file.create_array(root, 'anarray', [1], "Array title") + self.h5file.create_table(root, 'atable', {'var1': tb.IntCol()}, + "Table title") + + # Create a group object + group = self.h5file.create_group(root, 'agroup', "Group title") + group._v_attrs.testattr = 42 + + # Create a some objects there + array1 = self.h5file.create_array(group, 'anarray1', + [1, 2, 3, 4, 5, 6, 7], + "Array title 1") + array1.attrs.testattr = 42 + self.h5file.create_array(group, 'anarray2', [2], "Array title 2") + self.h5file.create_table(group, 'atable1', { + 'var1': tb.IntCol()}, "Table title 1") + ra = np.rec.array([(1, 11, 'a')], formats='u1,f4,a1') + self.h5file.create_table(group, 'atable2', ra, "Table title 2") + + # Create a lonely group in first level + self.h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + group3 = self.h5file.create_group(group, 'agroup3', "Group title 3") + + # 
Create a new group in the third level + self.h5file.create_group(group3, 'agroup4', "Group title 4") + + # Create an array in the root with the same name as one in 'agroup' + self.h5file.create_array(root, 'anarray1', [1, 2], + title="Array example") + + def test00_newFile(self): + """Checking creation of a new file.""" + + self.h5file.create_array(self.h5file.root, 'array_new', [1, 2], + title="Array example") + + # Get the CLASS attribute of the arr object + class_ = self.h5file.root.array.attrs.CLASS + + self.assertEqual(class_.capitalize(), "Array") + + def test00_newFile_unicode_filename(self): + temp_dir = tempfile.mkdtemp() + try: + h5fname = str(Path(temp_dir) / 'test.h5') + with tb.open_file(h5fname, 'w') as h5file: + self.assertTrue(h5file, tb.File) + finally: + shutil.rmtree(temp_dir) + + def test00_newFile_numpy_str_filename(self): + temp_dir = tempfile.mkdtemp() + try: + h5fname = np.str_(Path(temp_dir) / 'test.h5') + with tb.open_file(h5fname, 'w') as h5file: + self.assertTrue(h5file, tb.File) + finally: + shutil.rmtree(temp_dir) + + def test00_newFile_numpy_unicode_filename(self): + temp_dir = tempfile.mkdtemp() + try: + h5fname = np.unicode_(Path(temp_dir) / 'test.h5') + with tb.open_file(h5fname, 'w') as h5file: + self.assertTrue(h5file, tb.File) + finally: + shutil.rmtree(temp_dir) + + def test01_openFile(self): + """Checking opening of an existing file.""" + + # Open the old HDF5 file + self._reopen(node_cache_slots=self.node_cache_slots) + + # Get the CLASS attribute of the arr object + title = self.h5file.root.array.get_attr("TITLE") + + self.assertEqual(title, "Array example") + + def test01_open_file_pathlib(self): + """Checking opening of an existing file.""" + self.h5file.close() + h5fname = Path(self.h5fname) + with tables.open_file(h5fname) as h5file: + title = h5file.root.array.get_attr("TITLE") + self.assertEqual(title, "Array example") + + def test02_appendFile(self): + """Checking appending objects to an existing file.""" + + # Append a new array to the existing file + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.create_array(self.h5file.root, 'array2', [3, 4], + title="Title example 2") + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Get the CLASS attribute of the arr object + title = self.h5file.root.array2.get_attr("TITLE") + + self.assertEqual(title, "Title example 2") + + def test02b_appendFile2(self): + """Checking appending objects to an existing file ("a" version)""" + + # Append a new array to the existing file + self._reopen(mode="a", node_cache_slots=self.node_cache_slots) + self.h5file.create_array(self.h5file.root, 'array2', [3, 4], + title="Title example 2") + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Get the CLASS attribute of the arr object + title = self.h5file.root.array2.get_attr("TITLE") + + self.assertEqual(title, "Title example 2") + + # Begin to raise errors... + + def test03_appendErrorFile(self): + """Checking appending objects to an existing file in "w" mode.""" + + # Append a new array to the existing file but in write mode + # so, the existing file should be deleted! 
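+        # (mode="w" truncates the file on open, so every node created by
+        # populateFile() is expected to be gone afterwards)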
+        self._reopen(mode="w", node_cache_slots=self.node_cache_slots)
+        self.h5file.create_array(self.h5file.root, 'array2', [3, 4],
+                                 title="Title example 2")
+
+        # Open this file in read-only mode
+        self._reopen(node_cache_slots=self.node_cache_slots)
+
+        with self.assertRaises(LookupError):
+            # Try to get the 'array' object in the old existing file
+            self.h5file.root.array
+
+    def test04a_openErrorFile(self):
+        """Checking opening a non-existing file for reading"""
+
+        with self.assertRaises(IOError):
+            tb.open_file("nonexistent.h5", mode="r",
+                         node_cache_slots=self.node_cache_slots)
+
+    def test04b_alternateRootFile(self):
+        """Checking alternate root access to the object tree."""
+
+        # Open the existing HDF5 file
+        self._reopen(root_uep="/agroup",
+                     node_cache_slots=self.node_cache_slots)
+
+        # Get the CLASS attribute of the arr object
+        if common.verbose:
+            print("\nFile tree dump:", self.h5file)
+        title = self.h5file.root.anarray1.get_attr("TITLE")
+
+        # Get the node again, as this can trigger errors in some situations
+        anarray1 = self.h5file.root.anarray1
+        self.assertIsNotNone(anarray1)
+
+        self.assertEqual(title, "Array title 1")
+
+    # This test works well, but HDF5 emits a series of messages that
+    # may confuse the user. It is better to deactivate it.
+    def notest04c_alternateRootFile(self):
+        """Checking non-existent alternate root access to the object tree"""
+
+        with self.assertRaises(RuntimeError):
+            self._reopen(root_uep="/nonexistent",
+                         node_cache_slots=self.node_cache_slots)
+
+    def test05a_removeGroupRecursively(self):
+        """Checking removing a group recursively."""
+
+        # Delete a group with leaves
+        self._reopen(mode="r+", node_cache_slots=self.node_cache_slots)
+
+        with self.assertRaises(tb.NodeError):
+            self.h5file.remove_node(self.h5file.root.agroup)
+
+        # This should work now
+        self.h5file.remove_node(self.h5file.root, 'agroup', recursive=1)
+
+        # Open this file in read-only mode
+        self._reopen(node_cache_slots=self.node_cache_slots)
+
+        # Try to get the removed object
+        with self.assertRaises(LookupError):
+            self.h5file.root.agroup
+
+        # Try to get a child of the removed object
+        with self.assertRaises(LookupError):
+            self.h5file.get_node("/agroup/agroup3")
+
+    def test05b_removeGroupRecursively(self):
+        """Checking removing a group recursively and access to it
+        immediately."""
+
+        if common.verbose:
+            print('\n', '-=' * 30)
+            print("Running %s.test05b_removeGroupRecursively..." %
+                  self.__class__.__name__)
+
+        # Delete a group with leaves
+        self._reopen(mode="r+", node_cache_slots=self.node_cache_slots)
+
+        with self.assertRaises(tb.NodeError):
+            self.h5file.remove_node(self.h5file.root, 'agroup')
+
+        # This should work now
+        self.h5file.remove_node(self.h5file.root, 'agroup', recursive=1)
+
+        # Try to get the removed object
+        with self.assertRaises(LookupError):
+            self.h5file.root.agroup
+
+        # Try to get a child of the removed object
+        with self.assertRaises(LookupError):
+            self.h5file.get_node("/agroup/agroup3")
+
+    def test06_removeNodeWithDel(self):
+        """Checking removing a node using ``__delattr__()``"""
+
+        self._reopen(mode="r+", node_cache_slots=self.node_cache_slots)
+
+        with self.assertRaises(AttributeError):
+            # This should fail because there is no *Python attribute*
+            # called ``agroup``.
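+            # Nodes are removed with File.remove_node() or Node._f_remove(),
+            # not with ``del`` on the parent group.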
+ del self.h5file.root.agroup + + def test06a_removeGroup(self): + """Checking removing a lonely group from an existing file.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + self.h5file.remove_node(self.h5file.root, 'agroup2') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Try to get the removed object + with self.assertRaises(LookupError): + self.h5file.root.agroup2 + + def test06b_removeLeaf(self): + """Checking removing Leaves from an existing file.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.remove_node(self.h5file.root, 'anarray') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Try to get the removed object + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test06c_removeLeaf(self): + """Checking removing Leaves and access it immediately.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.remove_node(self.h5file.root, 'anarray') + + # Try to get the removed object + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test06d_removeLeaf(self): + """Checking removing a non-existent node""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Try to get the removed object + with self.assertRaises(LookupError): + self.h5file.remove_node(self.h5file.root, 'nonexistent') + + def test06e_removeTable(self): + """Checking removing Tables from an existing file.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.remove_node(self.h5file.root, 'atable') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Try to get the removed object + with self.assertRaises(LookupError): + self.h5file.root.atable + + def test07_renameLeaf(self): + """Checking renaming a leave and access it after a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.anarray, 'anarray2') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Ensure that the new name exists + array_ = self.h5file.root.anarray2 + self.assertEqual(array_.name, "anarray2") + self.assertEqual(array_._v_pathname, "/anarray2") + self.assertEqual(array_._v_depth, 1) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test07b_renameLeaf(self): + """Checking renaming Leaves and accesing them immediately.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.anarray, 'anarray2') + + # Ensure that the new name exists + array_ = self.h5file.root.anarray2 + self.assertEqual(array_.name, "anarray2") + self.assertEqual(array_._v_pathname, "/anarray2") + self.assertEqual(array_._v_depth, 1) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test07c_renameLeaf(self): + """Checking renaming Leaves and modify attributes after that.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.anarray, 'anarray2') + array_ = self.h5file.root.anarray2 + array_.attrs.TITLE = "hello" + + # Ensure that the new attribute has been written correctly + self.assertEqual(array_.title, "hello") + self.assertEqual(array_.attrs.TITLE, "hello") + + def 
test07d_renameLeaf(self): + """Checking renaming a Group under a nested group.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.agroup.anarray2, 'anarray3') + + # Ensure that we can access n attributes in the new group + node = self.h5file.root.agroup.anarray3 + self.assertEqual(node._v_title, "Array title 2") + + def test08_renameToExistingLeaf(self): + """Checking renaming a node to an existing name.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Try to get the previous object with the old name + with self.assertRaises(tb.NodeError): + self.h5file.rename_node(self.h5file.root.anarray, 'array') + + # Now overwrite the destination node. + anarray = self.h5file.root.anarray + self.h5file.rename_node(anarray, 'array', overwrite=True) + self.assertNotIn('/anarray', self.h5file) + self.assertIs(self.h5file.root.array, anarray) + + def test08b_renameToNotValidNaturalName(self): + """Checking renaming a node to a non-valid natural name""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + with warnings.catch_warnings(): + warnings.filterwarnings("error", category=tb.NaturalNameWarning) + + # Try to get the previous object with the old name + with self.assertRaises(tb.NaturalNameWarning): + self.h5file.rename_node(self.h5file.root.anarray, 'array 2') + + def test09_renameGroup(self): + """Checking renaming a Group and access it after a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.agroup, 'agroup3') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Ensure that the new name exists + group = self.h5file.root.agroup3 + self.assertEqual(group._v_name, "agroup3") + self.assertEqual(group._v_pathname, "/agroup3") + + # The children of this group also must be accessible through the + # new name path + group2 = self.h5file.get_node("/agroup3/agroup3") + self.assertEqual(group2._v_name, "agroup3") + self.assertEqual(group2._v_pathname, "/agroup3/agroup3") + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.agroup + + # Try to get a child with the old pathname + with self.assertRaises(LookupError): + self.h5file.get_node("/agroup/agroup3") + + def test09b_renameGroup(self): + """Checking renaming a Group and access it immediately.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.agroup, 'agroup3') + + # Ensure that the new name exists + group = self.h5file.root.agroup3 + self.assertEqual(group._v_name, "agroup3") + self.assertEqual(group._v_pathname, "/agroup3") + + # The children of this group also must be accessible through the + # new name path + group2 = self.h5file.get_node("/agroup3/agroup3") + self.assertEqual(group2._v_name, "agroup3") + self.assertEqual(group2._v_pathname, "/agroup3/agroup3") + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.agroup + + # Try to get a child with the old pathname + with self.assertRaises(LookupError): + self.h5file.get_node("/agroup/agroup3") + + def test09c_renameGroup(self): + """Checking renaming a Group and modify attributes afterwards.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.agroup, 'agroup3') + + # Ensure that we can modify attributes in the new group + group = 
self.h5file.root.agroup3 + group._v_attrs.TITLE = "Hello" + self.assertEqual(group._v_title, "Hello") + self.assertEqual(group._v_attrs.TITLE, "Hello") + + def test09d_renameGroup(self): + """Checking renaming a Group under a nested group.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + self.h5file.rename_node(self.h5file.root.agroup.agroup3, 'agroup4') + + # Ensure that we can access n attributes in the new group + group = self.h5file.root.agroup.agroup4 + self.assertEqual(group._v_title, "Group title 3") + + def test09e_renameGroup(self): + """Checking renaming a Group with nested groups in the LRU cache.""" + # This checks for ticket #126. + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Load intermediate groups and keep a nested one alive. + g = self.h5file.root.agroup.agroup3.agroup4 + self.assertIsNotNone(g) + self.h5file.rename_node('/', name='agroup', newname='agroup_') + + # see ticket #126 + self.assertNotIn('/agroup_/agroup4', self.h5file) + + self.assertNotIn('/agroup', self.h5file) + for newpath in ['/agroup_', '/agroup_/agroup3', + '/agroup_/agroup3/agroup4']: + self.assertIn(newpath, self.h5file) + self.assertEqual( + newpath, self.h5file.get_node(newpath)._v_pathname) + + def test10_moveLeaf(self): + """Checking moving a leave and access it after a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.anarray, newgroup, 'anarray2') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Ensure that the new name exists + array_ = self.h5file.root.newgroup.anarray2 + self.assertEqual(array_.name, "anarray2") + self.assertEqual(array_._v_pathname, "/newgroup/anarray2") + self.assertEqual(array_._v_depth, 2) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test10b_moveLeaf(self): + """Checking moving a leave and access it without a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.anarray, newgroup, 'anarray2') + + # Ensure that the new name exists + array_ = self.h5file.root.newgroup.anarray2 + self.assertEqual(array_.name, "anarray2") + self.assertEqual(array_._v_pathname, "/newgroup/anarray2") + self.assertEqual(array_._v_depth, 2) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.anarray + + def test10c_moveLeaf(self): + """Checking moving Leaves and modify attributes after that.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.anarray, newgroup, 'anarray2') + array_ = self.h5file.root.newgroup.anarray2 + array_.attrs.TITLE = "hello" + + # Ensure that the new attribute has been written correctly + self.assertEqual(array_.title, "hello") + self.assertEqual(array_.attrs.TITLE, "hello") + + def test10d_moveToExistingLeaf(self): + """Checking moving a leaf to an existing name.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Try to get the previous object with the old name + with self.assertRaises(tb.NodeError): + self.h5file.move_node( + self.h5file.root.anarray, self.h5file.root, 'array') + + def test10_2_moveTable(self): + """Checking moving a table 
and access it after a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.atable, newgroup, 'atable2') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Ensure that the new name exists + table_ = self.h5file.root.newgroup.atable2 + self.assertEqual(table_.name, "atable2") + self.assertEqual(table_._v_pathname, "/newgroup/atable2") + self.assertEqual(table_._v_depth, 2) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.atable + + def test10_2b_moveTable(self): + """Checking moving a table and access it without a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.atable, newgroup, 'atable2') + + # Ensure that the new name exists + table_ = self.h5file.root.newgroup.atable2 + self.assertEqual(table_.name, "atable2") + self.assertEqual(table_._v_pathname, "/newgroup/atable2") + self.assertEqual(table_._v_depth, 2) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.atable + + def test10_2b_bis_moveTable(self): + """Checking moving a table and use cached row without a close/open.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + + # Cache the Row attribute prior to the move + row = self.h5file.root.atable.row + self.h5file.move_node(self.h5file.root.atable, newgroup, 'atable2') + + # Ensure that the new name exists + table_ = self.h5file.root.newgroup.atable2 + self.assertEqual(table_.name, "atable2") + self.assertEqual(table_._v_pathname, "/newgroup/atable2") + self.assertEqual(table_._v_depth, 2) + + # Ensure that cache Row attribute has been updated + row = table_.row + self.assertEqual(table_._v_pathname, row.table._v_pathname) + nrows = table_.nrows + + # Add a new row just to make sure that this works + row.append() + table_.flush() + self.assertEqual(table_.nrows, nrows + 1) + + def test10_2c_moveTable(self): + """Checking moving tables and modify attributes after that.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group("/", "newgroup") + self.h5file.move_node(self.h5file.root.atable, newgroup, 'atable2') + table_ = self.h5file.root.newgroup.atable2 + table_.attrs.TITLE = "hello" + + # Ensure that the new attribute has been written correctly + self.assertEqual(table_.title, "hello") + self.assertEqual(table_.attrs.TITLE, "hello") + + def test10_2d_moveToExistingTable(self): + """Checking moving a table to an existing name.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Try to get the previous object with the old name + with self.assertRaises(tb.NodeError): + self.h5file.move_node(self.h5file.root.atable, self.h5file.root, + 'table') + + def test10_2e_moveToExistingTableOverwrite(self): + """Checking moving a table to an existing name, overwriting it.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + srcNode = self.h5file.root.atable + self.h5file.move_node(srcNode, self.h5file.root, 'table', + overwrite=True) + dstNode = self.h5file.root.table + + self.assertIs(srcNode, dstNode) + + def test11_moveGroup(self): + """Checking moving a Group and access it after a close/open.""" + + 
self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group(self.h5file.root, 'newgroup') + self.h5file.move_node(self.h5file.root.agroup, newgroup, 'agroup3') + + # Open this file in read-only mode + self._reopen(node_cache_slots=self.node_cache_slots) + + # Ensure that the new name exists + group = self.h5file.root.newgroup.agroup3 + self.assertEqual(group._v_name, "agroup3") + self.assertEqual(group._v_pathname, "/newgroup/agroup3") + self.assertEqual(group._v_depth, 2) + + # The children of this group must also be accessible through the + # new name path + group2 = self.h5file.get_node("/newgroup/agroup3/agroup3") + self.assertEqual(group2._v_name, "agroup3") + self.assertEqual(group2._v_pathname, "/newgroup/agroup3/agroup3") + self.assertEqual(group2._v_depth, 3) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.agroup + + # Try to get a child with the old pathname + with self.assertRaises(LookupError): + self.h5file.get_node("/agroup/agroup3") + + def test11b_moveGroup(self): + """Checking moving a Group and access it immediately.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group(self.h5file.root, 'newgroup') + self.h5file.move_node(self.h5file.root.agroup, newgroup, 'agroup3') + + # Ensure that the new name exists + group = self.h5file.root.newgroup.agroup3 + self.assertEqual(group._v_name, "agroup3") + self.assertEqual(group._v_pathname, "/newgroup/agroup3") + self.assertEqual(group._v_depth, 2) + + # The children of this group must also be accessible through the + # new name path + group2 = self.h5file.get_node("/newgroup/agroup3/agroup3") + self.assertEqual(group2._v_name, "agroup3") + self.assertEqual(group2._v_pathname, "/newgroup/agroup3/agroup3") + self.assertEqual(group2._v_depth, 3) + + # Try to get the previous object with the old name + with self.assertRaises(LookupError): + self.h5file.root.agroup + + # Try to get a child with the old pathname + with self.assertRaises(LookupError): + self.h5file.get_node("/agroup/agroup3") + + def test11c_moveGroup(self): + """Checking moving a Group and modify attributes afterwards.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + newgroup = self.h5file.create_group(self.h5file.root, 'newgroup') + self.h5file.move_node(self.h5file.root.agroup, newgroup, 'agroup3') + + # Ensure that we can modify attributes in the new group + group = self.h5file.root.newgroup.agroup3 + group._v_attrs.TITLE = "Hello" + group._v_attrs.hola = "Hello" + self.assertEqual(group._v_title, "Hello") + self.assertEqual(group._v_attrs.TITLE, "Hello") + self.assertEqual(group._v_attrs.hola, "Hello") + + def test11d_moveToExistingGroup(self): + """Checking moving a group to an existing name.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # Try to get the previous object with the old name + with self.assertRaises(tb.NodeError): + self.h5file.move_node(self.h5file.root.agroup, self.h5file.root, + 'agroup2') + + def test11e_moveToExistingGroupOverwrite(self): + """Checking moving a group to an existing name, overwriting it.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup2 -> agroup + srcNode = self.h5file.root.agroup2 + self.h5file.move_node(srcNode, self.h5file.root, 'agroup', + overwrite=True) + dstNode = self.h5file.root.agroup + + self.assertIs(srcNode, dstNode) + + def test12a_moveNodeOverItself(self): + """Checking 
moving a node over itself.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # array -> array + srcNode = self.h5file.root.array + self.h5file.move_node(srcNode, self.h5file.root, 'array') + dstNode = self.h5file.root.array + + self.assertIs(srcNode, dstNode) + + def test12b_moveGroupIntoItself(self): + """Checking moving a group into itself.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + with self.assertRaises(tb.NodeError): + # agroup2 -> agroup2/ + self.h5file.move_node(self.h5file.root.agroup2, + self.h5file.root.agroup2) + + def test13a_copyLeaf(self): + """Copying a leaf.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # array => agroup2/ + new_node = self.h5file.copy_node(self.h5file.root.array, + self.h5file.root.agroup2) + dstNode = self.h5file.root.agroup2.array + + self.assertIs(new_node, dstNode) + + def test13b_copyGroup(self): + """Copying a group.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup2 => agroup/ + new_node = self.h5file.copy_node(self.h5file.root.agroup2, + self.h5file.root.agroup) + dstNode = self.h5file.root.agroup.agroup2 + + self.assertIs(new_node, dstNode) + + def test13c_copyGroupSelf(self): + """Copying a group into itself.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup2 => agroup2/ + new_node = self.h5file.copy_node(self.h5file.root.agroup2, + self.h5file.root.agroup2) + dstNode = self.h5file.root.agroup2.agroup2 + + self.assertIs(new_node, dstNode) + + def test13d_copyGroupRecursive(self): + """Recursively copying a group.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup => agroup2/ + new_node = self.h5file.copy_node( + self.h5file.root.agroup, self.h5file.root.agroup2, recursive=True) + dstNode = self.h5file.root.agroup2.agroup + + self.assertIs(new_node, dstNode) + dstChild1 = dstNode.anarray1 + self.assertIsNotNone(dstChild1) + dstChild2 = dstNode.anarray2 + self.assertIsNotNone(dstChild2) + dstChild3 = dstNode.agroup3 + self.assertIsNotNone(dstChild3) + + def test13e_copyRootRecursive(self): + """Recursively copying the root group into the root of another file.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + h5fname2 = tempfile.mktemp(".h5") + h5file2 = tb.open_file( + h5fname2, mode="w", node_cache_slots=self.node_cache_slots) + try: + # h5file.root => h5file2.root + new_node = self.h5file.copy_node( + self.h5file.root, h5file2.root, recursive=True) + dstNode = h5file2.root + + self.assertIs(new_node, dstNode) + self.assertIn("/agroup", h5file2) + self.assertIn("/agroup/anarray1", h5file2) + self.assertIn("/agroup/agroup3", h5file2) + + finally: + h5file2.close() + Path(h5fname2).unlink() + + def test13f_copyRootRecursive(self): + """Recursively copying the root group into a group in another file.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + h5fname2 = tempfile.mktemp(".h5") + h5file2 = tb.open_file( + h5fname2, mode="w", node_cache_slots=self.node_cache_slots) + try: + h5file2.create_group('/', 'agroup2') + + # fileh.root => h5file2.root.agroup2 + new_node = self.h5file.copy_node( + self.h5file.root, h5file2.root.agroup2, recursive=True) + dstNode = h5file2.root.agroup2 + + self.assertIs(new_node, dstNode) + self.assertIn("/agroup2/agroup", h5file2) + self.assertIn("/agroup2/agroup/anarray1", h5file2) + self.assertIn("/agroup2/agroup/agroup3", h5file2) + + finally: + h5file2.close() + Path(h5fname2).unlink() + + 
def test13g_copyRootItself(self): + """Recursively copying the root group into itself.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + agroup2 = self.h5file.root + self.assertIsNotNone(agroup2) + + # h5file.root => h5file.root + self.assertRaises(IOError, self.h5file.copy_node, + self.h5file.root, self.h5file.root, recursive=True) + + def test14a_copyNodeExisting(self): + """Copying over an existing node.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + with self.assertRaises(tb.NodeError): + # agroup2 => agroup + self.h5file.copy_node(self.h5file.root.agroup2, newname='agroup') + + def test14b_copyNodeExistingOverwrite(self): + """Copying over an existing node, overwriting it.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup2 => agroup + new_node = self.h5file.copy_node(self.h5file.root.agroup2, + newname='agroup', overwrite=True) + dstNode = self.h5file.root.agroup + + self.assertIs(new_node, dstNode) + + def test14b2_copyNodeExistingOverwrite(self): + """Copying over an existing node in other file, overwriting it.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + h5fname2 = tempfile.mktemp(".h5") + h5file2 = tb.open_file( + h5fname2, mode="w", node_cache_slots=self.node_cache_slots) + + try: + # file1:/anarray1 => h5fname2:/anarray1 + new_node = self.h5file.copy_node(self.h5file.root.agroup.anarray1, + newparent=h5file2.root) + # file1:/ => h5fname2:/ + new_node = self.h5file.copy_node(self.h5file.root, h5file2.root, + overwrite=True, recursive=True) + dstNode = h5file2.root + + self.assertIs(new_node, dstNode) + finally: + h5file2.close() + Path(h5fname2).unlink() + + def test14c_copyNodeExistingSelf(self): + """Copying over self.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + with self.assertRaises(tb.NodeError): + # agroup => agroup + self.h5file.copy_node(self.h5file.root.agroup, newname='agroup') + + def test14d_copyNodeExistingOverwriteSelf(self): + """Copying over self, trying to overwrite.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + with self.assertRaises(tb.NodeError): + # agroup => agroup + self.h5file.copy_node( + self.h5file.root.agroup, newname='agroup', overwrite=True) + + def test14e_copyGroupSelfRecursive(self): + """Recursively copying a group into itself.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + with self.assertRaises(tb.NodeError): + # agroup => agroup/ + self.h5file.copy_node(self.h5file.root.agroup, + self.h5file.root.agroup, recursive=True) + + def test15a_oneStepMove(self): + """Moving and renaming a node in a single action.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # anarray1 -> agroup/array + srcNode = self.h5file.root.anarray1 + self.h5file.move_node(srcNode, self.h5file.root.agroup, 'array') + dstNode = self.h5file.root.agroup.array + + self.assertIs(srcNode, dstNode) + + def test15b_oneStepCopy(self): + """Copying and renaming a node in a single action.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # anarray1 => agroup/array + new_node = self.h5file.copy_node( + self.h5file.root.anarray1, self.h5file.root.agroup, 'array') + dstNode = self.h5file.root.agroup.array + + self.assertIs(new_node, dstNode) + + def test16a_fullCopy(self): + """Copying full data and user attributes.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup => groupcopy + srcNode = 
self.h5file.root.agroup + new_node = self.h5file.copy_node( + srcNode, newname='groupcopy', recursive=True) + dstNode = self.h5file.root.groupcopy + + self.assertIs(new_node, dstNode) + self.assertEqual(srcNode._v_attrs.testattr, dstNode._v_attrs.testattr) + self.assertEqual( + srcNode.anarray1.attrs.testattr, dstNode.anarray1.attrs.testattr) + self.assertEqual(srcNode.anarray1.read(), dstNode.anarray1.read()) + + def test16b_partialCopy(self): + """Copying partial data and no user attributes.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + # agroup => groupcopy + srcNode = self.h5file.root.agroup + new_node = self.h5file.copy_node( + srcNode, newname='groupcopy', + recursive=True, copyuserattrs=False, + start=0, stop=5, step=2) + dstNode = self.h5file.root.groupcopy + + self.assertIs(new_node, dstNode) + self.assertFalse(hasattr(dstNode._v_attrs, 'testattr')) + self.assertFalse(hasattr(dstNode.anarray1.attrs, 'testattr')) + self.assertEqual(srcNode.anarray1.read()[ + 0:5:2], dstNode.anarray1.read()) + + def test16c_fullCopy(self): + """Copying full data and user attributes (from file to file).""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + + h5fname2 = tempfile.mktemp(".h5") + h5file2 = tb.open_file( + h5fname2, mode="w", node_cache_slots=self.node_cache_slots) + + try: + # file1:/ => h5fname2:groupcopy + srcNode = self.h5file.root + new_node = self.h5file.copy_node( + srcNode, h5file2.root, newname='groupcopy', recursive=True) + dstNode = h5file2.root.groupcopy + + self.assertIs(new_node, dstNode) + self.assertEqual(srcNode._v_attrs.testattr, + dstNode._v_attrs.testattr) + self.assertEqual( + srcNode.agroup.anarray1.attrs.testattr, + dstNode.agroup.anarray1.attrs.testattr) + self.assertEqual(srcNode.agroup.anarray1.read(), + dstNode.agroup.anarray1.read()) + finally: + h5file2.close() + Path(h5fname2).unlink() + + def test17a_CopyChunkshape(self): + """Copying dataset with a chunkshape.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + srcTable = self.h5file.root.table + newTable = self.h5file.copy_node( + srcTable, newname='tablecopy', chunkshape=11) + + self.assertEqual(newTable.chunkshape, (11,)) + self.assertNotEqual(srcTable.chunkshape, newTable.chunkshape) + + def test17b_CopyChunkshape(self): + """Copying dataset with a chunkshape with 'keep' value.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + srcTable = self.h5file.root.table + newTable = self.h5file.copy_node( + srcTable, newname='tablecopy', chunkshape='keep') + + self.assertEqual(srcTable.chunkshape, newTable.chunkshape) + + def test17c_CopyChunkshape(self): + """Copying dataset with a chunkshape with 'auto' value.""" + + self._reopen(mode="r+", node_cache_slots=self.node_cache_slots) + srcTable = self.h5file.root.table + newTable = self.h5file.copy_node( + srcTable, newname='tablecopy', chunkshape=11) + newTable2 = self.h5file.copy_node( + newTable, newname='tablecopy2', chunkshape='auto') + + self.assertEqual(srcTable.chunkshape, newTable2.chunkshape) + + def test18_closedRepr(self): + """Representing a closed node as a string.""" + + self._reopen(node_cache_slots=self.node_cache_slots) + + for node in [self.h5file.root.agroup, self.h5file.root.anarray]: + node._f_close() + self.assertIn('closed', str(node)) + self.assertIn('closed', repr(node)) + + def test19_fileno(self): + """Checking that the 'fileno()' method works.""" + + # Open the old HDF5 file + self._reopen(mode="r", node_cache_slots=self.node_cache_slots) + + # 
Get the file descriptor for this file + fd = self.h5file.fileno() + if common.verbose: + print("Value of fileno():", fd) + self.assertGreaterEqual(fd, 0) + + +class NodeCacheOpenFile(OpenFileTestCase): + node_cache_slots = tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class NoNodeCacheOpenFile(OpenFileTestCase): + node_cache_slots = 0 + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class DictNodeCacheOpenFile(OpenFileTestCase): + node_cache_slots = -tb.parameters.NODE_CACHE_SLOTS + open_kwargs = dict(node_cache_slots=node_cache_slots) + + +class CheckFileTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + # Create a regular (text) file + self.txtfile = tempfile.mktemp(".h5") + self.fileh = open(self.txtfile, "w") + self.fileh.write("Hello!") + self.fileh.close() + + def tearDown(self): + self.fileh.close() + Path(self.txtfile).unlink() + super().tearDown() + + def test00_isHDF5File(self): + """Checking tables.is_hdf5_file function (TRUE case)""" + + # Create a PyTables file (and by so, an HDF5 file) + self.h5file.create_array(self.h5file.root, 'array', [1, 2], + title="Title example") + + # For this method to run, it needs a closed file + self.h5file.close() + + # When file has an HDF5 format, always returns 1 + if common.verbose: + print("\nisHDF5File(%s) ==> %d" % ( + self.h5fname, tb.is_hdf5_file(self.h5fname))) + self.assertEqual(tb.is_hdf5_file(self.h5fname), 1) + + def test01_isHDF5File(self): + """Checking tables.is_hdf5_file function (FALSE case)""" + + version = tb.is_hdf5_file(self.txtfile) + + # When file is not an HDF5 format, always returns 0 or + # negative value + self.assertLessEqual(version, 0) + + def test01x_isHDF5File_nonexistent(self): + """Identifying a nonexistent HDF5 file.""" + self.assertRaises(IOError, tb.is_hdf5_file, 'nonexistent') + + @common.unittest.skipUnless(hasattr(os, 'getuid') and os.getuid() != 0, + "no UID") + def test01x_isHDF5File_unreadable(self): + """Identifying an unreadable HDF5 file.""" + + self.h5file.close() + Path(self.h5fname).chmod(0) # no permissions at all + self.assertRaises(IOError, tb.is_hdf5_file, self.h5fname) + + def test02_isPyTablesFile(self): + """Checking is_pytables_file function (TRUE case)""" + + # Create a PyTables h5fname + self.h5file.create_array(self.h5file.root, 'array', + [1, 2], title="Title example") + + # For this method to run, it needs a closed h5fname + self.h5file.close() + + version = tb.is_pytables_file(self.h5fname) + + # When h5fname has a PyTables format, always returns "1.0" string or + # greater + if common.verbose: + print() + print("\nPyTables format version number ==> %s" % version) + self.assertGreaterEqual(version, "1.0") + + def test03_isPyTablesFile(self): + """Checking is_pytables_file function (FALSE case)""" + + version = tb.is_pytables_file(self.txtfile) + + # When file is not a PyTables format, always returns 0 or + # negative value + if common.verbose: + print() + print("\nPyTables format version number ==> %s" % version) + self.assertIsNone(version) + + def test04_openGenericHDF5File(self): + """Checking opening of a generic HDF5 file.""" + + # Open an existing generic HDF5 file + h5fname = common.test_filename("ex-noattr.h5") + with tb.open_file(h5fname, mode="r") as h5file: + # Check for some objects inside + + # A group + columns = h5file.get_node("/columns", classname="Group") + self.assertEqual(columns._v_name, "columns") + + # An Array + array_ = h5file.get_node(columns, "TDC", 
classname="Array") + self.assertEqual(array_._v_name, "TDC") + + # The new LRU code defers the appearance of a warning to this point + + # Here comes an Array of H5T_ARRAY type + ui = h5file.get_node(columns, "pressure", classname="Array") + self.assertEqual(ui._v_name, "pressure") + if common.verbose: + print("Array object with type H5T_ARRAY -->", repr(ui)) + print("Array contents -->", ui[:]) + + # A Table + table = h5file.get_node("/detector", "table", classname="Table") + self.assertEqual(table._v_name, "table") + + def test04b_UnImplementedOnLoading(self): + """Checking failure loading resulting in an ``UnImplemented`` node.""" + + # ############## Note for developers ############################## + # This test fails if you have the line: # + # ##return ChildClass(self, childname) # uncomment for debugging # + # uncommented in Group.py! # + # ################################################################# + + h5fname = common.test_filename('smpl_unsupptype.h5') + with tb.open_file(h5fname) as h5file: + with self.assertWarns(UserWarning): + node = h5file.get_node('/CompoundChunked') + self.assertIsInstance(node, tb.UnImplemented) + + def test04c_UnImplementedScalar(self): + """Checking opening of HDF5 files containing scalar dataset of + UnImlemented type.""" + + with tb.open_file(common.test_filename("scalar.h5")) as h5file: + with self.assertWarns(UserWarning): + node = h5file.get_node('/variable length string') + self.assertIsInstance(node, tb.UnImplemented) + + def test05_copyUnimplemented(self): + """Checking that an UnImplemented object cannot be copied.""" + + # Open an existing generic HDF5 file + h5fname = common.test_filename("smpl_unsupptype.h5") + with tb.open_file(h5fname, mode="r") as h5file: + self.assertWarns(UserWarning, h5file.get_node, '/CompoundChunked') + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + ui = h5file.get_node('/CompoundChunked') + self.assertEqual(ui._v_name, 'CompoundChunked') + if common.verbose: + print("UnImplement object -->", repr(ui)) + + # Check that it cannot be copied to another file: + self.assertWarns(UserWarning, ui.copy, self.h5file.root, "newui") + + # The next can be used to check the copy of Array objects with H5T_ARRAY + # in the future + def _test05_copyUnimplemented(self): + """Checking that an UnImplemented object cannot be copied.""" + + # Open an existing generic HDF5 file + # We don't need to wrap this in a try clause because + # it has already been tried and the warning will not happen again + h5fname2 = common.test_filename("ex-noattr.h5") + with tb.open_file(h5fname2, mode="r") as h5file2: + # An unsupported object (the deprecated H5T_ARRAY type in + # Array, from pytables 0.8 on) + ui = h5file2.get_node(h5file2.root.columns, "pressure") + self.assertEqual(ui._v_name, "pressure") + if common.verbose: + print("UnImplement object -->", repr(ui)) + + # Check that it cannot be copied to another file + with warnings.catch_warnings(): + # Force the userwarning to issue an error + warnings.filterwarnings("error", category=UserWarning) + with self.assertRaises(UserWarning): + ui.copy(self.h5file.root, "newui") + + +@common.unittest.skipIf(tb.file._FILE_OPEN_POLICY == 'strict', + 'FILE_OPEN_POLICY = "strict"') +class ThreadingTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5file.create_carray('/', 'test_array', tb.Int64Atom(), + (200, 300)) + self.h5file.close() + + def test(self): + lock = threading.Lock() + + def syncronized_open_file(*args, **kwargs): + 
with lock: + return tb.open_file(*args, **kwargs) + + def syncronized_close_file(self, *args, **kwargs): + with lock: + return self.close(*args, **kwargs) + + filename = self.h5fname + + def run(filename, q): + try: + f = syncronized_open_file(filename, mode='r') + arr = f.root.test_array[8:12, 18:22] + assert arr.max() == arr.min() == 0 + syncronized_close_file(f) + except Exception: + q.put(sys.exc_info()) + else: + q.put('OK') + + threads = [] + q = queue.Queue() + for i in range(10): + t = threading.Thread(target=run, args=(filename, q)) + t.start() + threads.append(t) + + for i in range(10): + self.assertEqual(q.get(), 'OK') + + for t in threads: + t.join() + + +class PythonAttrsTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test interactions of Python attributes and child nodes.""" + + def test00_attrOverChild(self): + """Setting a Python attribute over a child node.""" + + root = self.h5file.root + + # Create ``/test`` and overshadow it with ``root.test``. + child = self.h5file.create_array(root, 'test', [1]) + attr = 'foobar' + self.assertWarns(tb.NaturalNameWarning, setattr, root, 'test', attr) + + self.assertIs(root.test, attr) + self.assertIs(root._f_get_child('test'), child) + + # Now bring ``/test`` again to light. + del root.test + + self.assertIs(root.test, child) + + # Now there is no *attribute* named ``test``. + self.assertRaises(AttributeError, + delattr, root, 'test') + + def test01_childUnderAttr(self): + """Creating a child node under a Python attribute.""" + + h5file = self.h5file + root = h5file.root + + # Create ``root.test`` and an overshadowed ``/test``. + attr = 'foobar' + root.test = attr + self.assertWarns(tb.NaturalNameWarning, + h5file.create_array, root, 'test', [1]) + child = h5file.get_node('/test') + + self.assertIs(root.test, attr) + self.assertIs(root._f_get_child('test'), child) + + # Now bring ``/test`` again to light. + del root.test + + self.assertIs(root.test, child) + + # Now there is no *attribute* named ``test``. + self.assertRaises(AttributeError, delattr, root, 'test') + + def test02_nodeAttrInLeaf(self): + """Assigning a ``Node`` value as an attribute to a ``Leaf``.""" + + h5file = self.h5file + + array1 = h5file.create_array('/', 'array1', [1]) + array2 = h5file.create_array('/', 'array2', [1]) + + # This may make the garbage collector work a little. + array1.array2 = array2 + array2.array1 = array1 + + # Check the assignments. + self.assertIs(array1.array2, array2) + self.assertIs(array2.array1, array1) + self.assertRaises(tb.NoSuchNodeError, # ``/array1`` is not a group + h5file.get_node, '/array1/array2') + self.assertRaises(tb.NoSuchNodeError, # ``/array2`` is not a group + h5file.get_node, '/array2/array3') + + def test03_nodeAttrInGroup(self): + """Assigning a ``Node`` value as an attribute to a ``Group``.""" + + h5file = self.h5file + root = h5file.root + + array = h5file.create_array('/', 'array', [1]) + + # Assign the array to a pair of attributes, + # one of them overshadowing the original. + root.arrayAlias = array + self.assertWarns(tb.NaturalNameWarning, setattr, root, 'array', array) + + # Check the assignments. + self.assertIs(root.arrayAlias, array) + self.assertIs(root.array, array) + self.assertRaises(tb.NoSuchNodeError, h5file.get_node, '/arrayAlias') + self.assertIs(h5file.get_node('/array'), array) + + # Remove the attribute overshadowing the child. + del root.array + + # Now there is no *attribute* named ``array``. 
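+        # ``root.array`` resolves to the child node again, so a second
+        # ``del`` finds no instance attribute to remove.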
+ self.assertRaises(AttributeError, delattr, root, 'array') + + +class StateTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test that ``File`` and ``Node`` operations check their state (open or + closed, readable or writable) before proceeding.""" + + def test00_fileCopyFileClosed(self): + """Test copying a closed file.""" + + self.h5file.close() + h5cfname = tempfile.mktemp(suffix='.h5') + + try: + self.assertRaises(tb.ClosedFileError, + self.h5file.copy_file, h5cfname) + finally: + if Path(h5cfname).is_file(): + Path(h5cfname).unlink() + + def test01_fileCloseClosed(self): + """Test closing an already closed file.""" + + self.h5file.close() + + try: + self.h5file.close() + except tb.ClosedFileError: + self.fail("could not close an already closed file") + + def test02_fileFlushClosed(self): + """Test flushing a closed file.""" + + self.h5file.close() + self.assertRaises(tb.ClosedFileError, self.h5file.flush) + + def test03_fileFlushRO(self): + """Flushing a read-only file.""" + + self._reopen('r') + + try: + self.h5file.flush() + except tb.FileModeError: + self.fail("could not flush a read-only file") + + def test04_fileCreateNodeClosed(self): + """Test creating a node in a closed file.""" + + self.h5file.close() + self.assertRaises(tb.ClosedFileError, + self.h5file.create_group, '/', 'test') + + def test05_fileCreateNodeRO(self): + """Test creating a node in a read-only file.""" + + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.create_group, '/', 'test') + + def test06_fileRemoveNodeClosed(self): + """Test removing a node from a closed file.""" + + self.h5file.create_group('/', 'test') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, + self.h5file.remove_node, '/', 'test') + + def test07_fileRemoveNodeRO(self): + """Test removing a node from a read-only file.""" + + self.h5file.create_group('/', 'test') + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.remove_node, '/', 'test') + + def test08_fileMoveNodeClosed(self): + """Test moving a node in a closed file.""" + + self.h5file.create_group('/', 'test1') + self.h5file.create_group('/', 'test2') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, + self.h5file.move_node, '/test1', '/', 'test2') + + def test09_fileMoveNodeRO(self): + """Test moving a node in a read-only file.""" + + self.h5file.create_group('/', 'test1') + self.h5file.create_group('/', 'test2') + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.move_node, '/test1', '/', 'test2') + + def test10_fileCopyNodeClosed(self): + """Test copying a node in a closed file.""" + + self.h5file.create_group('/', 'test1') + self.h5file.create_group('/', 'test2') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, + self.h5file.copy_node, '/test1', '/', 'test2') + + def test11_fileCopyNodeRO(self): + """Test copying a node in a read-only file.""" + + self.h5file.create_group('/', 'test1') + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.copy_node, '/test1', '/', 'test2') + + def test13_fileGetNodeClosed(self): + """Test getting a node from a closed file.""" + + self.h5file.create_group('/', 'test') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, self.h5file.get_node, '/test') + + def test14_fileWalkNodesClosed(self): + """Test walking a closed file.""" + + self.h5file.create_group('/', 'test1') + self.h5file.create_group('/', 'test2') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, next, self.h5file.walk_nodes()) + + def 
test15_fileAttrClosed(self): + """Test setting and deleting a node attribute in a closed file.""" + + self.h5file.create_group('/', 'test') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, + self.h5file.set_node_attr, '/test', 'foo', 'bar') + self.assertRaises(tb.ClosedFileError, + self.h5file.del_node_attr, '/test', 'foo') + + def test16_fileAttrRO(self): + """Test setting and deleting a node attribute in a read-only file.""" + + self.h5file.create_group('/', 'test') + self.h5file.set_node_attr('/test', 'foo', 'foo') + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.set_node_attr, '/test', 'foo', 'bar') + self.assertRaises(tb.FileModeError, + self.h5file.del_node_attr, '/test', 'foo') + + def test17_fileUndoClosed(self): + """Test undo operations in a closed file.""" + + self.h5file.enable_undo() + self.h5file.create_group('/', 'test2') + self.h5file.close() + self.assertRaises(tb.ClosedFileError, self.h5file.is_undo_enabled) + self.assertRaises(tb.ClosedFileError, self.h5file.get_current_mark) + self.assertRaises(tb.ClosedFileError, self.h5file.undo) + self.assertRaises(tb.ClosedFileError, self.h5file.disable_undo) + + def test18_fileUndoRO(self): + """Test undo operations in a read-only file.""" + + self.h5file.enable_undo() + self.h5file.create_group('/', 'test') + self._reopen('r') + self.assertEqual(self.h5file._undoEnabled, False) + # self.assertRaises(FileModeError, self.h5file.undo) + # self.assertRaises(FileModeError, self.h5file.disable_undo) + + def test19a_getNode(self): + """Test getting a child of a closed node.""" + + g1 = self.h5file.create_group('/', 'g1') + g2 = self.h5file.create_group('/g1', 'g2') + + # Close this *object* so that it should not be used. + g1._f_close() + self.assertRaises(tb.ClosedNodeError, g1._f_get_child, 'g2') + + # Getting a node by its closed object is not allowed. + self.assertRaises(tb.ClosedNodeError, self.h5file.get_node, g1) + + # Going through that *node* should reopen it automatically. + try: + g2_ = self.h5file.get_node('/g1/g2') + except tb.ClosedNodeError: + self.fail("closed parent group has not been reopened") + + # Already open nodes should be closed now, but not the new ones. + self.assertIs(g2._v_isopen, False, + "open child of closed group has not been closed") + self.assertIs(g2_._v_isopen, True, + "open child of closed group has not been closed") + + # And existing closed ones should remain closed, but not the new ones. + g1_ = self.h5file.get_node('/g1') + self.assertIs(g1._v_isopen, False, + "already closed group is not closed anymore") + self.assertIs(g1_._v_isopen, True, + "newly opened group is still closed") + + def test19b_getNode(self): + """Test getting a node that does not start with a slash ('/').""" + + # Create an array in the root + self.h5file.create_array('/', 'array', [1, 2], title="Title example") + + # Get the array without specifying a leading slash + self.assertRaises(NameError, self.h5file.get_node, "array") + + def test20_removeNode(self): + """Test removing a closed node.""" + + # This test is a little redundant once we know that ``File.get_node()`` + # will reload a closed node, but anyway... + + group = self.h5file.create_group('/', 'group') + array = self.h5file.create_array('/group', 'array', [1]) + + # The closed *object* can not be used. + group._f_close() + self.assertRaises(tb.ClosedNodeError, group._f_remove) + self.assertRaises(tb.ClosedNodeError, self.h5file.remove_node, group) + + # Still, the *node* is reloaded when necessary. 
+ try: + self.h5file.remove_node('/group', recursive=True) + except tb.ClosedNodeError: + self.fail("closed node has not been reloaded") + + # Objects of descendent removed nodes + # should have been automatically closed when removed. + self.assertRaises(tb.ClosedNodeError, array._f_remove) + + self.assertNotIn('/group/array', self.h5file) # just in case + self.assertNotIn('/group', self.h5file) # just in case + + def test21_attrsOfNode(self): + """Test manipulating the attributes of a closed node.""" + + node = self.h5file.create_group('/', 'test') + nodeAttrs = node._v_attrs + + nodeAttrs.test = attr = 'foo' + + node._f_close() + self.assertRaises(tb.ClosedNodeError, getattr, node, '_v_attrs') + # The design of ``AttributeSet`` does not yet allow this test. + # self.assertRaises(ClosedNodeError, getattr, nodeAttrs, 'test') + + self.assertEqual(self.h5file.get_node_attr('/test', 'test'), attr) + + def test21b_attrsOfNode(self): + """Test manipulating the attributes of a node in a read-only file.""" + + self.h5file.create_group('/', 'test') + self.h5file.set_node_attr('/test', 'test', 'foo') + + self._reopen('r') + self.assertRaises(tb.FileModeError, + self.h5file.set_node_attr, '/test', 'test', 'bar') + + def test22_fileClosesNode(self): + """Test node closing because of file closing.""" + + node = self.h5file.create_group('/', 'test') + + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, getattr, node, '_v_attrs') + + def test23_reopenFile(self): + """Testing reopening a file and closing it several times.""" + + self.h5file.create_array('/', 'test', [1, 2, 3]) + self.h5file.close() + + with tb.open_file(self.h5fname, "r") as h5file1: + if tb.file._FILE_OPEN_POLICY == 'strict': + self.assertRaises(ValueError, tb.open_file, self.h5fname, "r") + else: + with tb.open_file(self.h5fname, "r") as h5file2: + if common.verbose: + print("(h5file1) test[1]:", h5file1.root.test[1]) + self.assertEqual(h5file1.root.test[1], 2) + h5file1.close() + + if common.verbose: + print("(h5file2) test[1]:", h5file2.root.test[1]) + self.assertEqual(h5file2.root.test[1], 2) + + +class FlavorTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test that setting, getting and changing the ``flavor`` attribute of a + leaf works as expected.""" + + array_data = np.arange(10) + scalar_data = np.int32(10) + + def _reopen(self, mode='r'): + super()._reopen(mode) + self.array = self.h5file.get_node('/array') + self.scalar = self.h5file.get_node('/scalar') + return True + + def setUp(self): + super().setUp() + self.array = self.h5file.create_array('/', 'array', self.array_data) + self.scalar = self.h5file.create_array('/', 'scalar', self.scalar_data) + + def test00_invalid(self): + """Setting an invalid flavor.""" + + self.assertRaises(tb.FlavorError, setattr, self.array, 'flavor', 'foo') + + def test01_readonly(self): + """Setting a flavor in a read-only file.""" + + self._reopen(mode='r') + self.assertRaises(tb.FileModeError, + setattr, self.array, 'flavor', + tb.flavor.internal_flavor) + + def test02_change(self): + """Changing the flavor and reading data.""" + + for flavor in tb.flavor.all_flavors: + self.array.flavor = flavor + self.assertEqual(self.array.flavor, flavor) + idata = tb.flavor.array_of_flavor(self.array_data, flavor) + odata = self.array[:] + self.assertTrue(common.allequal(odata, idata, flavor)) + + def test03_store(self): + """Storing a changed flavor.""" + + for flavor in tb.flavor.all_flavors: + self.array.flavor = flavor + self.assertEqual(self.array.flavor, flavor) + 
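+            # Reopen the file so the flavor is read back from the FLAVOR
+            # attribute stored on disk rather than from the cached node.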
self._reopen(mode='r+') + self.assertEqual(self.array.flavor, flavor) + + def test04_missing(self): + """Reading a dataset of a missing flavor.""" + + flavor = self.array.flavor # default is internal + self.array._v_attrs.FLAVOR = 'foobar' # breaks flavor + self._reopen(mode='r') + idata = tb.flavor.array_of_flavor(self.array_data, flavor) + with self.assertWarns(tb.FlavorWarning): + odata = self.array.read() + self.assertTrue(common.allequal(odata, idata, flavor)) + + def test05_delete(self): + """Deleting the flavor of a dataset.""" + + self.array.flavor = 'python' # non-default + self.assertEqual(self.array.flavor, 'python') + self.assertEqual(self.array.attrs.FLAVOR, 'python') + del self.array.flavor + self.assertEqual(self.array.flavor, tb.flavor.internal_flavor) + self.assertRaises(AttributeError, getattr, self.array.attrs, 'FLAVOR') + + def test06_copyDeleted(self): + """Copying a node with a deleted flavor (see #100).""" + + snames = [node._v_name for node in [self.array, self.scalar]] + dnames = ['%s_copy' % name for name in snames] + for name in snames: + node = self.h5file.get_node('/', name) + del node.flavor + # Check the copied flavors right after copying and after reopening. + for fmode in ['r+', 'r']: + self._reopen(fmode) + for sname, dname in zip(snames, dnames): + if fmode == 'r+': + snode = self.h5file.get_node('/', sname) + node = snode.copy('/', dname) + elif fmode == 'r': + node = self.h5file.get_node('/', dname) + self.assertEqual(node.flavor, tb.flavor.internal_flavor, + "flavor of node ``%s`` is not internal: %r" + % (node._v_pathname, node.flavor)) + + def test07_restrict_flavors(self): + # regression test for gh-163 + + all_flavors = list(tb.flavor.all_flavors) + alias_map = tb.flavor.alias_map.copy() + converter_map = tb.flavor.converter_map.copy() + identifier_map = tb.flavor.identifier_map.copy() + description_map = tb.flavor.description_map.copy() + + try: + tb.flavor.restrict_flavors(keep=[]) + self.assertLess(len(tb.flavor.alias_map), len(alias_map)) + self.assertLess( + len(tb.flavor.converter_map), + len(converter_map)) + finally: + tb.flavor.all_flavors[:] = all_flavors[:] + tb.flavor.alias_map.update(alias_map) + tb.flavor.converter_map.update(converter_map) + tb.flavor.identifier_map.update(identifier_map) + tb.flavor.description_map.update(description_map) + + +# @common.unittest.skipIf(sys.getfilesystemencoding() != 'utf-8', +# 'need utf-8 file-system encoding') +class UnicodeFilename(common.TempFileMixin, common.PyTablesTestCase): + unicode_prefix = 'para\u0140lel' + + def _getTempFileName(self): + return tempfile.mktemp(prefix=self.unicode_prefix, suffix='.h5') + + def setUp(self): + super().setUp() + + self.test = self.h5file.create_array('/', 'test', [1, 2]) + + # So as to check the reading + self._reopen() + + def test01(self): + """Checking creating a filename with Unicode chars.""" + + test = self.h5file.root.test + if common.verbose: + print("Filename:", self.h5fname) + print("Array:", test[:]) + print("Should look like:", [1, 2]) + self.assertEqual(test[:], [1, 2], "Values does not match.") + + def test02(self): + """Checking tables.is_hdf5_file with a Unicode filename.""" + + self.h5file.close() + if common.verbose: + print("Filename:", self.h5fname) + print(" tables.is_hdf5_file?:", tb.is_hdf5_file(self.h5fname)) + self.assertTrue(tb.is_hdf5_file(self.h5fname)) + + def test03(self): + """Checking is_pytables_file with a Unicode filename.""" + + self.h5file.close() + if common.verbose: + print("Filename:", self.h5fname) + 
print("is_pytables_file?:", tb.is_pytables_file(self.h5fname)) + self.assertNotEqual(tb.is_pytables_file(self.h5fname), False) + + @staticmethod + def _store_carray(name, data, group): + atom = tb.Atom.from_dtype(data.dtype) + node = tb.CArray(group, name, shape=data.shape, atom=atom) + node[:] = data + + def test_store_and_load_with_non_ascii_attributes(self): + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, "a") + root = self.h5file.root + group = self.h5file.create_group(root, 'face_data') + array_name = 'data at 40\N{DEGREE SIGN}C' + data = np.sinh(np.linspace(-1.4, 1.4, 500)) + with warnings.catch_warnings(): + warnings.simplefilter('ignore', tb.NaturalNameWarning) + self._store_carray(array_name, data, group) + group = self.h5file.create_group(root, 'vertex_data') + + +@common.unittest.skipIf(sys.version_info < (3, 6), + 'PEP 519 was implemented in Python 3.6') +class PathLikeFilename(common.TempFileMixin, common.PyTablesTestCase): + + def _getTempFileName(self): + from pathlib import Path + return Path(tempfile.mktemp(suffix='.h5')) + + def setUp(self): + super().setUp() + + self.test = self.h5file.create_array('/', 'test', [1, 2]) + + # So as to check the reading + self._reopen() + + def test01(self): + """Checking creating a file with a PathLike object as the filename.""" + + test = self.h5file.root.test + if common.verbose: + print("Filename:", self.h5fname) + print("Array:", test[:]) + print("Should look like:", [1, 2]) + self.assertEqual(test[:], [1, 2], "Values does not match.") + + def test02(self): + """Checking tables.is_hdf5_file with a PathLike object as the + filename.""" + + self.h5file.close() + if common.verbose: + print("Filename:", self.h5fname) + print(" tables.is_hdf5_file?:", tb.is_hdf5_file(self.h5fname)) + self.assertTrue(tb.is_hdf5_file(self.h5fname)) + + def test03(self): + """Checking is_pytables_file with a PathLike object as the filename.""" + + self.h5file.close() + if common.verbose: + print("Filename:", self.h5fname) + print("is_pytables_file?:", tb.is_pytables_file(self.h5fname)) + self.assertNotEqual(tb.is_pytables_file(self.h5fname), False) + + def test04_str(self): + str(self.h5file) + + +class FilePropertyTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(".h5") + self.h5file = None + + def tearDown(self): + if self.h5file: + self.h5file.close() + + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def test_get_filesize(self): + data = np.zeros((2000, 2000)) + datasize = np.prod(data.shape) * data.dtype.itemsize + + self.h5file = tb.open_file(self.h5fname, mode="w") + self.h5file.create_array(self.h5file.root, 'array', data) + h5_filesize = self.h5file.get_filesize() + self.h5file.close() + + fs_filesize = Path(self.h5fname).stat().st_size + + self.assertGreaterEqual(h5_filesize, datasize) + self.assertEqual(h5_filesize, fs_filesize) + + def test01_null_userblock_size(self): + self.h5file = tb.open_file(self.h5fname, mode="w") + self.h5file.create_array(self.h5file.root, 'array', [1, 2]) + self.assertEqual(self.h5file.get_userblock_size(), 0) + + def test02_null_userblock_size(self): + self.h5file = tb.open_file(self.h5fname, mode="w") + self.h5file.create_array(self.h5file.root, 'array', [1, 2]) + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, mode="r") + self.assertEqual(self.h5file.get_userblock_size(), 0) + + def test03_null_userblock_size(self): + USER_BLOCK_SIZE = 0 + self.h5file = tb.open_file( + self.h5fname, mode="w", 
user_block_size=USER_BLOCK_SIZE) + self.h5file.create_array(self.h5file.root, 'array', [1, 2]) + self.assertEqual(self.h5file.get_userblock_size(), 0) + + def test01_userblock_size(self): + USER_BLOCK_SIZE = 512 + self.h5file = tb.open_file( + self.h5fname, mode="w", user_block_size=USER_BLOCK_SIZE) + self.h5file.create_array(self.h5file.root, 'array', [1, 2]) + self.assertEqual(self.h5file.get_userblock_size(), USER_BLOCK_SIZE) + + def test02_userblock_size(self): + USER_BLOCK_SIZE = 512 + self.h5file = tb.open_file( + self.h5fname, mode="w", user_block_size=USER_BLOCK_SIZE) + self.h5file.create_array(self.h5file.root, 'array', [1, 2]) + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, mode="r") + self.assertEqual(self.h5file.get_userblock_size(), USER_BLOCK_SIZE) + + def test_small_userblock_size(self): + USER_BLOCK_SIZE = 12 + self.assertRaises(ValueError, tb.open_file, self.h5fname, mode="w", + user_block_size=USER_BLOCK_SIZE) + + def test_invalid_userblock_size(self): + USER_BLOCK_SIZE = 1025 + self.assertRaises(ValueError, tb.open_file, self.h5fname, mode="w", + user_block_size=USER_BLOCK_SIZE) + + +# Test for reading a file that uses Blosc and created on a big-endian platform +@common.unittest.skipIf(not common.blosc_avail, 'Blosc not available') +class BloscBigEndian(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename("blosc_bigendian.h5") + + def test00_bigendian(self): + """Checking compatibility with Blosc on big-endian machines.""" + + # Check that we can read the contents without problems (nor warnings!) + for dset_name in ('i1', 'i2', 'i4', 'i8'): + a = np.arange(10, dtype=dset_name) + dset = self.h5file.get_node('/'+dset_name) + self.assertTrue(common.allequal(a, dset[:]), + "Error in big-endian data!") + + +# Case test for Blosc and subprocesses (via multiprocessing module) + +# The worker function for the subprocess (needs to be here because Windows +# has problems pickling nested functions with the multiprocess module :-/) +def _worker(fn, qout=None): + fp = tb.open_file(fn) + if common.verbose: + print("About to load: ", fn) + rows = fp.root.table.where('(f0 < 10)') + if common.verbose: + print("Got the iterator, about to iterate") + next(rows) + if common.verbose: + print("Succeeded in one iteration\n") + fp.close() + + if qout is not None: + qout.put("Done") + + +# From: Yaroslav Halchenko +# Subject: Skip the unittest on kFreeBSD and Hurd -- locking seems to +# be N/A +# +# on kfreebsd /dev/shm is N/A +# on Hurd -- inter-process semaphore locking is N/A +@common.unittest.skipIf(not multiprocessing_imported, + 'multiprocessing module not available') +@common.unittest.skipIf(platform.system().lower() in ('gnu', 'gnu/kfreebsd'), + "multiprocessing module is not " + "supported on Hurd/kFreeBSD") +@common.unittest.skipIf(not common.blosc_avail, 'Blosc not available') +class BloscSubprocess(common.PyTablesTestCase): + def test_multiprocess(self): + # Create a relatively large table with Blosc level 9 (large blocks) + h5fname = tempfile.mktemp(prefix="multiproc-blosc9-", suffix=".h5") + try: + size = 300_000 + sa = np.fromiter(((i, i**2, i//3) for i in range(size)), + 'i4,i8,f8') + with tb.open_file(h5fname, 'w') as h5file: + h5file.create_table( + h5file.root, 'table', sa, + filters=tb.Filters(complevel=9, complib="blosc"), + chunkshape=(size // 3,)) + + if common.verbose: + print("**** Running from main process:") + _worker(h5fname) + + if common.verbose: + print("**** Running from subprocess:") + + try: + qout = mp.Queue() + 
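+                # Creating the queue may fail with OSError when /dev/shm is
+                # not usable (see the message printed below).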
except OSError:
+                print("Permission denied due to /dev/shm settings")
+            else:
+                ps = mp.Process(target=_worker, args=(h5fname, qout,))
+                ps.daemon = True
+                ps.start()
+
+                result = qout.get()
+                if common.verbose:
+                    print(result)
+        finally:
+            Path(h5fname).unlink()
+
+
+class HDF5ErrorHandling(common.PyTablesTestCase):
+    def setUp(self):
+        super().setUp()
+        self._old_policy = tb.HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY
+
+    def tearDown(self):
+        tb.HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY = self._old_policy
+        super().tearDown()
+
+    def test_silence_messages(self):
+        code = """
+import tables as tb
+tb.silence_hdf5_messages(False)
+tb.silence_hdf5_messages()
+try:
+    tb.open_file(r'%s')
+except tb.HDF5ExtError as e:
+    pass
+"""
+
+        filename = tempfile.mktemp(prefix="hdf5-error-handling-", suffix=".py")
+        try:
+            with open(filename, 'w') as fp:
+                fp.write(code % filename)
+
+            p = subprocess.Popen([sys.executable, filename],
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+            (stdout, stderr) = p.communicate()
+
+            self.assertNotIn("HDF5-DIAG", stderr.decode('ascii'))
+        finally:
+            Path(filename).unlink()
+
+    def test_enable_messages(self):
+        code = """
+import tables as tb
+tb.silence_hdf5_messages()
+tb.silence_hdf5_messages(False)
+try:
+    tb.open_file(r'%s')
+except tb.HDF5ExtError as e:
+    pass
+"""
+
+        filename = tempfile.mktemp(prefix="hdf5-error-handling-", suffix=".py")
+        try:
+            Path(filename).write_text(code % filename)
+
+            p = subprocess.Popen([sys.executable, filename],
+                                 stdout=subprocess.PIPE,
+                                 stderr=subprocess.PIPE)
+            (stdout, stderr) = p.communicate()
+
+            self.assertIn("HDF5-DIAG", stderr.decode('ascii'))
+        finally:
+            Path(filename).unlink()
+
+    def _raise_exterror(self):
+        h5fname = tempfile.mktemp(".h5")
+        Path(h5fname).write_text('')
+
+        try:
+            h5file = tb.open_file(h5fname)
+            h5file.close()
+        finally:
+            Path(h5fname).unlink()
+
+    def test_h5_backtrace_quiet(self):
+        tb.HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY = True
+
+        with self.assertRaises(tb.HDF5ExtError) as cm:
+            self._raise_exterror()
+
+        self.assertIsNotNone(cm.exception.h5backtrace)
+
+    def test_h5_backtrace_verbose(self):
+        tb.HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY = "VERBOSE"
+
+        with self.assertRaises(tb.HDF5ExtError) as cm:
+            self._raise_exterror()
+
+        self.assertIsNotNone(cm.exception.h5backtrace)
+        msg = str(cm.exception)
+        self.assertIn(cm.exception.h5backtrace[-1][-1], msg)
+
+    def test_h5_backtrace_ignore(self):
+        tb.HDF5ExtError.DEFAULT_H5_BACKTRACE_POLICY = False
+
+        with self.assertRaises(tb.HDF5ExtError) as cm:
+            self._raise_exterror()
+
+        self.assertIsNone(cm.exception.h5backtrace)
+
+
+class TestDescription(common.PyTablesTestCase):
+    def test_isdescription_inheritance(self):
+        # Regression test for gh-65
+        class TestDescParent(tb.IsDescription):
+            c = tb.Int32Col()
+
+        class TestDesc(TestDescParent):
+            pass
+
+        self.assertIn('c', TestDesc.columns)
+
+    def test_descr_from_dtype(self):
+        t = np.dtype([('col1', 'int16'), ('col2', float)])
+        descr, byteorder = tb.description.descr_from_dtype(t)
+
+        self.assertIn('col1', descr._v_colobjects)
+        self.assertIn('col2', descr._v_colobjects)
+        self.assertEqual(len(descr._v_colobjects), 2)
+        self.assertIsInstance(descr._v_colobjects['col1'], tb.Col)
+        self.assertIsInstance(descr._v_colobjects['col2'], tb.Col)
+        self.assertEqual(descr._v_colobjects['col1'].dtype, np.int16)
+        self.assertEqual(descr._v_colobjects['col2'].dtype, float)
+
+    def test_descr_from_dtype_rich_dtype(self):
+        header = [(('timestamp', 't'), 'u4'),
+                  (('unit (cluster) id', 'unit'), 'u2')]
+        t =
np.dtype(header)
+
+        descr, byteorder = tb.description.descr_from_dtype(t)
+        self.assertEqual(len(descr._v_names), 2)
+        self.assertEqual(sorted(descr._v_names), ['t', 'unit'])
+
+    def test_descr_from_dtype_comp_01(self):
+        d1 = np.dtype([('x', 'int16'), ('y', 'int16')])
+        d_comp = np.dtype([('time', 'float64'), ('value', d1)])
+
+        descr, byteorder = tb.description.descr_from_dtype(d_comp)
+
+        self.assertTrue(descr._v_is_nested)
+        self.assertIn('time', descr._v_colobjects)
+        self.assertIn('value', descr._v_colobjects)
+        self.assertEqual(len(descr._v_colobjects), 2)
+        self.assertIsInstance(descr._v_colobjects['time'], tb.Col)
+        self.assertTrue(isinstance(descr._v_colobjects['value'],
+                                   tb.Description))
+        self.assertEqual(descr._v_colobjects['time'].dtype, np.float64)
+
+    def test_descr_from_dtype_comp_02(self):
+        d1 = np.dtype([('x', 'int16'), ('y', 'int16')])
+
+        d_comp = np.dtype([('time', 'float64'), ('value', (d1, (1,)))])
+
+        with self.assertWarns(UserWarning):
+            descr, byteorder = tb.description.descr_from_dtype(d_comp)
+
+        self.assertTrue(descr._v_is_nested)
+        self.assertIn('time', descr._v_colobjects)
+        self.assertIn('value', descr._v_colobjects)
+        self.assertEqual(len(descr._v_colobjects), 2)
+        self.assertIsInstance(descr._v_colobjects['time'], tb.Col)
+        self.assertTrue(isinstance(descr._v_colobjects['value'],
+                                   tb.Description))
+        self.assertEqual(descr._v_colobjects['time'].dtype, np.float64)
+
+    def test_dtype_from_descr_is_description(self):
+        # See gh-152
+        class TestDescParent(tb.IsDescription):
+            col1 = tb.Int16Col()
+            col2 = tb.FloatCol()
+
+        dtype = np.dtype([('col1', 'int16'), ('col2', float)])
+        t = tb.description.dtype_from_descr(TestDescParent)
+
+        self.assertEqual(t, dtype)
+
+    def test_dtype_from_descr_is_description_instance(self):
+        # See gh-152
+        class TestDescParent(tb.IsDescription):
+            col1 = tb.Int16Col()
+            col2 = tb.FloatCol()
+
+        dtype = np.dtype([('col1', 'int16'), ('col2', float)])
+        t = tb.description.dtype_from_descr(TestDescParent())
+
+        self.assertEqual(t, dtype)
+
+    def test_dtype_from_descr_description_instance(self):
+        # See gh-152
+        class TestDescParent(tb.IsDescription):
+            col1 = tb.Int16Col()
+            col2 = tb.FloatCol()
+
+        dtype = np.dtype([('col1', 'int16'), ('col2', float)])
+        description = tb.Description(TestDescParent().columns)
+        t = tb.description.dtype_from_descr(description)
+
+        self.assertEqual(t, dtype)
+
+    def test_dtype_from_descr_dict(self):
+        # See gh-152
+        dtype = np.dtype([('col1', 'int16'), ('col2', float)])
+        t = tb.description.dtype_from_descr(
+            {'col1': tb.Int16Col(), 'col2': tb.FloatCol()})
+
+        self.assertEqual(t, dtype)
+
+    def test_dtype_from_descr_invalid_type(self):
+        # See gh-152
+        self.assertRaises(ValueError, tb.description.dtype_from_descr, [])
+
+    def test_dtype_from_descr_byteorder(self):
+        # See gh-152
+        class TestDescParent(tb.IsDescription):
+            col1 = tb.Int16Col()
+            col2 = tb.FloatCol()
+
+        t = tb.description.dtype_from_descr(TestDescParent, byteorder='>')
+
+        self.assertEqual(t['col1'].byteorder, '>')
+        self.assertEqual(t['col2'].byteorder, '>')
+
+    def test_str_names(self):
+        # see gh-42
+        d = {'name': tb.Int16Col()}
+        descr = tb.Description(d)
+        self.assertEqual(sorted(descr._v_names), sorted(d.keys()))
+        self.assertIsInstance(descr._v_dtype, np.dtype)
+        self.assertEqual(sorted(descr._v_dtype.fields), sorted(d.keys()))
+
+
+class TestAtom(common.PyTablesTestCase):
+    def test_atom_attributes01(self):
+        shape = (10, 10)
+        a = tb.Float64Atom(shape=shape)
+
+        self.assertEqual(a.dflt, 0.)
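+        # A shaped atom exposes a NumPy subarray dtype of the form
+        # (base dtype, shape).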
+ self.assertEqual(a.dtype, np.dtype((np.float64, shape))) + self.assertEqual(a.itemsize, a.dtype.base.itemsize) + self.assertEqual(a.kind, 'float') + self.assertEqual(a.ndim, len(shape)) + # self.assertEqual(a.recarrtype, ) + self.assertEqual(a.shape, shape) + self.assertEqual(a.size, a.itemsize * np.prod(shape)) + self.assertEqual(a.type, 'float64') + + def test_atom_copy01(self): + shape = (10, 10) + a = tb.Float64Atom(shape=shape) + aa = a.copy() + self.assertEqual(aa.shape, shape) + + def test_atom_copy02(self): + dflt = 2.0 + a = tb.Float64Atom(dflt=dflt) + aa = a.copy() + self.assertEqual(aa.dflt, dflt) + + def test_atom_copy_override(self): + shape = (10, 10) + dflt = 2.0 + a = tb.Float64Atom(shape=shape, dflt=dflt) + aa = a.copy(dflt=-dflt) + self.assertEqual(aa.shape, shape) + self.assertNotEqual(aa.dflt, dflt) + self.assertEqual(aa.dflt, -dflt) + + +class TestCol(common.PyTablesTestCase): + def test_col_copy01(self): + shape = (10, 10) + c = tb.Float64Col(shape=shape) + cc = c.copy() + self.assertEqual(cc.shape, shape) + + def test_col_copy02(self): + dflt = 2.0 + c = tb.Float64Col(dflt=dflt) + cc = c.copy() + self.assertEqual(cc.dflt, dflt) + + def test_col_copy_override(self): + shape = (10, 10) + dflt = 2.0 + pos = 3 + c = tb.Float64Col(shape=shape, dflt=dflt, pos=pos) + cc = c.copy(pos=2) + self.assertEqual(cc.shape, shape) + self.assertEqual(cc.dflt, dflt) + self.assertNotEqual(cc._v_pos, pos) + self.assertEqual(cc._v_pos, 2) + + +class TestSysattrCompatibility(common.PyTablesTestCase): + def test_open_python2(self): + h5fname = common.test_filename("python2.h5") + with tb.open_file(h5fname, "r") as h5file: + self.assertTrue(h5file.isopen) + + def test_open_python3(self): + h5fname = common.test_filename("python3.h5") + with tb.open_file(h5fname, "r") as h5file: + self.assertTrue(h5file.isopen) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(OpenFileFailureTestCase)) + theSuite.addTest(common.unittest.makeSuite(NodeCacheOpenFile)) + theSuite.addTest(common.unittest.makeSuite(NoNodeCacheOpenFile)) + theSuite.addTest(common.unittest.makeSuite(DictNodeCacheOpenFile)) + theSuite.addTest(common.unittest.makeSuite(CheckFileTestCase)) + theSuite.addTest(common.unittest.makeSuite(ThreadingTestCase)) + theSuite.addTest(common.unittest.makeSuite(PythonAttrsTestCase)) + theSuite.addTest(common.unittest.makeSuite(StateTestCase)) + theSuite.addTest(common.unittest.makeSuite(FlavorTestCase)) + theSuite.addTest(common.unittest.makeSuite(UnicodeFilename)) + theSuite.addTest(common.unittest.makeSuite(PathLikeFilename)) + theSuite.addTest(common.unittest.makeSuite(FilePropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscBigEndian)) + theSuite.addTest(common.unittest.makeSuite(BloscSubprocess)) + theSuite.addTest(common.unittest.makeSuite(HDF5ErrorHandling)) + theSuite.addTest(common.unittest.makeSuite(TestDescription)) + theSuite.addTest(common.unittest.makeSuite(TestAtom)) + theSuite.addTest(common.unittest.makeSuite(TestCol)) + theSuite.addTest(common.unittest.makeSuite(TestSysattrCompatibility)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_carray.py b/tables/tests/test_carray.py new file mode 100644 index 0000000..5faf29f --- /dev/null +++ b/tables/tests/test_carray.py @@ -0,0 +1,2836 @@ +import sys +from pathlib import Path + +import numpy as 
np + +import tables as tb +from tables.tests import common + + +class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase): + # Default values + obj = None + flavor = "numpy" + type = 'int32' + shape = (2, 2) + start = 0 + stop = 10 + step = 1 + length = 1 + chunkshape = (5, 5) + compress = 0 + complib = "zlib" # Default compression library + shuffle = 0 + bitshuffle = 0 + fletcher32 = 0 + reopen = 1 # Tells whether the file has to be reopened on each test or not + + def setUp(self): + super().setUp() + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + if self.reopen: + # Close the file + self.h5file.close() + + def populateFile(self): + group = self.rootgroup + obj = self.obj + if obj is None: + if self.type == "string": + atom = tb.StringAtom(itemsize=self.length) + else: + atom = tb.Atom.from_type(self.type) + else: + atom = None + title = self.__class__.__name__ + filters = tb.Filters(complevel=self.compress, + complib=self.complib, + shuffle=self.shuffle, + bitshuffle=self.bitshuffle, + fletcher32=self.fletcher32) + carray = self.h5file.create_carray(group, 'carray1', + atom=atom, shape=self.shape, + title=title, filters=filters, + chunkshape=self.chunkshape, obj=obj) + carray.flavor = self.flavor + + # Fill it with data + self.rowshape = list(carray.shape) + self.objsize = self.length * np.prod(carray.shape) + + if self.flavor == "numpy": + if self.type == "string": + object = np.ndarray(buffer=b"a"*self.objsize, + shape=self.shape, + dtype="S%s" % carray.atom.itemsize) + else: + object = np.arange(self.objsize, dtype=carray.atom.dtype) + object.shape = carray.shape + if common.verbose: + print("Object to append -->", repr(object)) + + carray[...] = object + + def _get_shape(self): + if self.shape is not None: + shape = self.shape + else: + shape = np.asarray(self.obj).shape + + return shape + + def test00_attributes(self): + if self.reopen: + self.h5file = tb.open_file(self.h5fname, "r") + obj = self.h5file.get_node("/carray1") + + shape = self._get_shape() + + self.assertEqual(obj.flavor, self.flavor) + self.assertEqual(obj.shape, shape) + self.assertEqual(obj.ndim, len(shape)) + self.assertEqual(obj.chunkshape, self.chunkshape) + self.assertEqual(obj.nrows, shape[0]) + self.assertEqual(obj.atom.type, self.type) + + def test01_readCArray(self): + """Checking read() of chunked layout arrays.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readCArray..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + if self.reopen: + self.h5file = tb.open_file(self.h5fname, "r") + carray = self.h5file.get_node("/carray1") + + # Choose a small value for buffer size + carray.nrowsinbuf = 3 + if common.verbose: + print("CArray descr:", repr(carray)) + print("shape of read array ==>", carray.shape) + print("reopening?:", self.reopen) + + shape = self._get_shape() + + # Build the array to do comparisons + if self.flavor == "numpy": + if self.type == "string": + object_ = np.ndarray(buffer=b"a" * self.objsize, + shape=self.shape, + dtype=f"S{carray.atom.itemsize}") + else: + object_ = np.arange(self.objsize, dtype=carray.atom.dtype) + object_.shape = shape + + stop = self.stop + # stop == None means read only the element designed by start + # (in read() contexts) + if self.stop is None: + if self.start == -1: # corner case + stop = carray.nrows + else: + stop = self.start + 1 + # Protection against number of elements less than existing + # if rowshape[self.extdim] < self.stop or self.stop == 0: + if carray.nrows < stop: + # self.stop == 0 means last row only in read() + # and not in [::] slicing notation + stop = int(carray.nrows) + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + obj = object_[self.start:stop:self.step].copy() + + # Read all the array + try: + data = carray.read(self.start, stop, self.step) + except IndexError: + if self.flavor == "numpy": + data = np.empty(shape=self.shape, dtype=self.type) + else: + data = np.empty(shape=self.shape, dtype=self.type) + + if common.verbose: + if hasattr(obj, "shape"): + print("shape should look as:", obj.shape) + print("Object read ==>", repr(data)) + print("Should look like ==>", repr(obj)) + + if hasattr(data, "shape"): + self.assertEqual(len(data.shape), len(shape)) + else: + # Scalar case + self.assertEqual(len(self.shape), 1) + self.assertEqual(carray.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(data, obj, self.flavor)) + + def test01_readCArray_out_argument(self): + """Checking read() of chunked layout arrays.""" + + # Create an instance of an HDF5 Table + if self.reopen: + self.h5file = tb.open_file(self.h5fname, "r") + carray = self.h5file.get_node("/carray1") + + shape = self._get_shape() + + # Choose a small value for buffer size + carray.nrowsinbuf = 3 + # Build the array to do comparisons + if self.flavor == "numpy": + if self.type == "string": + object_ = np.ndarray(buffer=b"a" * self.objsize, + shape=self.shape, + dtype=f"S{carray.atom.itemsize}") + else: + object_ = np.arange(self.objsize, dtype=carray.atom.dtype) + object_.shape = shape + + stop = self.stop + # stop == None means read only the element designed by start + # (in read() contexts) + if self.stop is None: + if self.start == -1: # corner case + stop = carray.nrows + else: + stop = self.start + 1 + # Protection against number of elements less than existing + # if rowshape[self.extdim] < self.stop or self.stop == 0: + if carray.nrows < stop: + # self.stop == 0 means last row only in read() + # and not in [::] slicing notation + stop = int(carray.nrows) + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + obj = object_[self.start:stop:self.step].copy() + + # Read all the array + try: + data = np.empty(shape, dtype=carray.atom.dtype) + data = data[self.start:stop:self.step].copy() + carray.read(self.start, stop, self.step, out=data) + except IndexError: + if self.flavor == "numpy": + data = 
np.empty(shape=shape, dtype=self.type) + else: + data = np.empty(shape=shape, dtype=self.type) + + if hasattr(data, "shape"): + self.assertEqual(len(data.shape), len(shape)) + else: + # Scalar case + self.assertEqual(len(shape), 1) + self.assertEqual(carray.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(data, obj, self.flavor)) + + def test02_getitemCArray(self): + """Checking chunked layout array __getitem__ special method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_getitemCArray..." % + self.__class__.__name__) + + if not hasattr(self, "slices"): + # If there is not a slices attribute, create it + self.slices = (slice(self.start, self.stop, self.step),) + + # Create an instance of an HDF5 Table + if self.reopen: + self.h5file = tb.open_file(self.h5fname, "r") + carray = self.h5file.get_node("/carray1") + + if common.verbose: + print("CArray descr:", repr(carray)) + print("shape of read array ==>", carray.shape) + print("reopening?:", self.reopen) + + shape = self._get_shape() + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a" * self.objsize, + shape=self.shape, + dtype=f"S{carray.atom.itemsize}") + else: + object_ = np.arange(self.objsize, dtype=carray.atom.dtype) + object_.shape = shape + + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + obj = object_.__getitem__(self.slices).copy() + + # Read data from the array + try: + data = carray.__getitem__(self.slices) + except IndexError: + print("IndexError!") + if self.flavor == "numpy": + data = np.empty(shape=self.shape, dtype=self.type) + else: + data = np.empty(shape=self.shape, dtype=self.type) + + if common.verbose: + print("Object read:\n", repr(data)) # , data.info() + print("Should look like:\n", repr(obj)) # , object.info() + if hasattr(obj, "shape"): + print("Original object shape:", self.shape) + print("Shape read:", data.shape) + print("shape should look as:", obj.shape) + + if not hasattr(data, "shape"): + # Scalar case + self.assertEqual(len(self.shape), 1) + self.assertEqual(carray.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(data, obj, self.flavor)) + + def test03_setitemCArray(self): + """Checking chunked layout array __setitem__ special method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_setitemCArray..." 
%
+                  self.__class__.__name__)
+
+        if not hasattr(self, "slices"):
+            # If there is not a slices attribute, create it
+            self.slices = (slice(self.start, self.stop, self.step),)
+
+        # Create an instance of an HDF5 Table
+        if self.reopen:
+            self.h5file = tb.open_file(self.h5fname, "a")
+        carray = self.h5file.get_node("/carray1")
+
+        if common.verbose:
+            print("CArray descr:", repr(carray))
+            print("shape of read array ==>", carray.shape)
+            print("reopening?:", self.reopen)
+
+        shape = self._get_shape()
+
+        # Build the array to do comparisons
+        if self.type == "string":
+            object_ = np.ndarray(buffer=b"a" * self.objsize,
+                                 shape=self.shape,
+                                 dtype=f"S{carray.atom.itemsize}")
+        else:
+            object_ = np.arange(self.objsize, dtype=carray.atom.dtype)
+            object_.shape = shape
+
+        # do a copy() in order to ensure that len(object._data)
+        # actually do a measure of its length
+        obj = object_.__getitem__(self.slices).copy()
+
+        if self.type == "string":
+            if hasattr(self, "wslice"):
+                obj[self.wslice] = "xXx"
+                carray[self.wslice] = "xXx"
+            elif sum(obj[self.slices].shape) != 0:
+                obj[:] = "xXx"
+                if obj.size > 0:
+                    carray[self.slices] = obj
+        else:
+            if hasattr(self, "wslice"):
+                obj[self.wslice] = obj[self.wslice] * 2 + 3
+                carray[self.wslice] = carray[self.wslice] * 2 + 3
+            elif sum(obj[self.slices].shape) != 0:
+                obj = obj * 2 + 3
+                if np.prod(obj.shape) > 0:
+                    carray[self.slices] = carray[self.slices] * 2 + 3
+            # Cast again object to its original type
+            obj = np.array(obj, dtype=carray.atom.dtype)
+        # Read data from the array
+        try:
+            data = carray.__getitem__(self.slices)
+        except IndexError:
+            print("IndexError!")
+            if self.flavor == "numpy":
+                data = np.empty(shape=self.shape, dtype=self.type)
+            else:
+                data = np.empty(shape=self.shape, dtype=self.type)
+
+        if common.verbose:
+            print("Object read:\n", repr(data))  # , data.info()
+            print("Should look like:\n", repr(obj))  # , object.info()
+            if hasattr(obj, "shape"):
+                print("Original object shape:", self.shape)
+                print("Shape read:", data.shape)
+                print("shape should look as:", obj.shape)
+
+        if not hasattr(data, "shape"):
+            # Scalar case
+            self.assertEqual(len(self.shape), 1)
+        self.assertEqual(carray.chunkshape, self.chunkshape)
+        self.assertTrue(common.allequal(data, obj, self.flavor))
+
+
+class BasicWriteTestCase(BasicTestCase):
+    type = 'int32'
+    shape = (2,)
+    chunkshape = (5,)
+    step = 1
+    wslice = 1  # single element case
+
+
+class BasicWrite2TestCase(BasicTestCase):
+    type = 'int32'
+    shape = (2,)
+    chunkshape = (5,)
+    step = 1
+    wslice = slice(shape[0]-2, shape[0], 2)  # range of elements
+    reopen = 0  # This case does not reopen files
+
+
+class BasicWrite3TestCase(BasicTestCase):
+    obj = [1, 2]
+    type = np.asarray(obj).dtype.name
+    shape = None
+    chunkshape = (5,)
+    step = 1
+    reopen = 0  # This case does not reopen files
+
+
+class BasicWrite4TestCase(BasicTestCase):
+    obj = np.array([1, 2])
+    type = obj.dtype.name
+    shape = None
+    chunkshape = (5,)
+    step = 1
+    reopen = 0  # This case does not reopen files
+
+
+class BasicWrite5TestCase(BasicTestCase):
+    obj = [[1, 2], [3, 4]]
+    type = np.asarray(obj).dtype.name
+    shape = None
+    chunkshape = (5, 1)
+    step = 1
+    reopen = 0  # This case does not reopen files
+
+
+class BasicWrite6TestCase(BasicTestCase):
+    obj = [1, 2]
+    type = np.asarray(obj).dtype.name
+    shape = None
+    chunkshape = (5,)
+    step = 1
+    reopen = 1  # This case does reopen files
+
+
+class BasicWrite7TestCase(BasicTestCase):
+    obj = np.array([1, 2])
+    type = obj.dtype.name
+    shape = None
+    chunkshape = (5,)
+    step = 1
+    reopen = 1
# This case does reopen files + + +class BasicWrite8TestCase(BasicTestCase): + obj = [[1, 2], [3, 4]] + type = np.asarray(obj).dtype.name + shape = None + chunkshape = (5, 1) + step = 1 + reopen = 1 # This case does reopen files + + +class EmptyCArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 2) + chunkshape = (5, 5) + start = 0 + stop = 10 + step = 1 + + +class EmptyCArray2TestCase(BasicTestCase): + type = 'int32' + shape = (2, 2) + chunkshape = (5, 5) + start = 0 + stop = 10 + step = 1 + reopen = 0 # This case does not reopen files + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class SlicesCArrayTestCase(BasicTestCase): + compress = 1 + complib = "lzo" + type = 'int32' + shape = (2, 2) + chunkshape = (5, 5) + slices = (slice(1, 2, 1), slice(1, 3, 1)) + + +class EllipsisCArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 2) + chunkshape = (5, 5) + # slices = (slice(1,2,1), Ellipsis) + slices = (Ellipsis, slice(1, 2, 1)) + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class Slices2CArrayTestCase(BasicTestCase): + compress = 1 + complib = "lzo" + type = 'int32' + shape = (2, 2, 4) + chunkshape = (5, 5, 5) + slices = (slice(1, 2, 1), slice(None, None, None), slice(1, 4, 2)) + + +class Ellipsis2CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 2, 4) + chunkshape = (5, 5, 5) + slices = (slice(1, 2, 1), Ellipsis, slice(1, 4, 2)) + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class Slices3CArrayTestCase(BasicTestCase): + compress = 1 # To show the chunks id DEBUG is on + complib = "lzo" + type = 'int32' + shape = (2, 3, 4, 2) + chunkshape = (5, 5, 5, 5) + slices = (slice(1, 2, 1), slice( + 0, None, None), slice(1, 4, 2)) # Don't work + # slices = (slice(None, None, None), slice(0, None, None), + # slice(1,4,1)) # W + # slices = (slice(None, None, None), slice(None, None, None), + # slice(1,4,2)) # N + # slices = (slice(1,2,1), slice(None, None, None), slice(1,4,2)) # N + # Disable the failing test temporarily with a working test case + slices = (slice(1, 2, 1), slice(1, 4, None), slice(1, 4, 2)) # Y + # slices = (slice(1,2,1), slice(0, 4, None), slice(1,4,1)) # Y + slices = (slice(1, 2, 1), slice(0, 4, None), slice(1, 4, 2)) # N + # slices = (slice(1,2,1), slice(0, 4, None), slice(1,4,2), + # slice(0,100,1)) # N + + +class Slices4CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 2, 5, 6) + chunkshape = (5, 5, 5, 5, 5, 5) + slices = (slice(1, 2, 1), slice(0, None, None), slice(1, 4, 2), + slice(0, 4, 2), slice(3, 5, 2), slice(2, 7, 1)) + + +class Ellipsis3CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 2) + chunkshape = (5, 5, 5, 5) + slices = (Ellipsis, slice(0, 4, None), slice(1, 4, 2)) + slices = (slice(1, 2, 1), slice(0, 4, None), slice(1, 4, 2), Ellipsis) + + +class Ellipsis4CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 5) + chunkshape = (5, 5, 5, 5) + slices = (Ellipsis, slice(0, 4, None), slice(1, 4, 2)) + slices = (slice(1, 2, 1), Ellipsis, slice(1, 4, 2)) + + +class Ellipsis5CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 5) + chunkshape = (5, 5, 5, 5) + slices = (slice(1, 2, 1), slice(0, 4, None), Ellipsis) + + +class Ellipsis6CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 5) + chunkshape = (5, 5, 5, 5) + # The next slices gives problems with setting values (test03) + # This is a problem on the test design, not 
the Array.__setitem__ + # code, though. See # see test_earray.py Ellipsis6EArrayTestCase + slices = (slice(1, 2, 1), slice(0, 4, None), 2, Ellipsis) + + +class Ellipsis7CArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 5) + chunkshape = (5, 5, 5, 5) + slices = (slice(1, 2, 1), slice(0, 4, None), slice(2, 3), Ellipsis) + + +class MD3WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 2, 3) + chunkshape = (4, 4, 4) + step = 2 + + +class MD5WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 2, 3, 4, 5) # ok + # shape = (1, 1, 2, 1) # Minimum shape that shows problems with HDF5 1.6.1 + # shape = (2, 3, 2, 4, 5) # Floating point exception (HDF5 1.6.1) + # shape = (2, 3, 3, 2, 5, 6) # Segmentation fault (HDF5 1.6.1) + chunkshape = (1, 1, 1, 1, 1) + start = 1 + stop = 10 + step = 10 + + +class MD6WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 3, 2, 5, 6) + chunkshape = (1, 1, 1, 1, 5, 6) + start = 1 + stop = 10 + step = 3 + + +class MD6WriteTestCase__(BasicTestCase): + type = 'int32' + shape = (2, 2) + chunkshape = (1, 1) + start = 1 + stop = 3 + step = 1 + + +class MD7WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 3, 4, 5, 2, 3) + chunkshape = (10, 10, 10, 10, 10, 10, 10) + start = 1 + stop = 10 + step = 2 + + +class MD10WriteTestCase(BasicTestCase): + type = 'int32' + shape = (1, 2, 3, 4, 5, 5, 4, 3, 2, 2) + chunkshape = (5, 5, 5, 5, 5, 5, 5, 5, 5, 5) + start = -1 + stop = -1 + step = 10 + + +class ZlibComprTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + start = 3 + # stop = 0 # means last row + stop = None # means last row from 0.8 on + step = 10 + + +class ZlibShuffleTestCase(BasicTestCase): + shuffle = 1 + compress = 1 + complib = "zlib" + # case start < stop , i.e. no rows read + start = 3 + stop = 1 + step = 10 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscComprTestCase(BasicTestCase): + compress = 1 # sss + complib = "blosc" + chunkshape = (10, 10) + start = 3 + stop = 10 + step = 3 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscShuffleTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + common.blosc_version < common.min_blosc_bitshuffle_version, + f'BLOSC >= {common.min_blosc_bitshuffle_version} required') +class BloscBitShuffleTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + bitshuffle = 1 + complib = "blosc" + chunkshape = (200, 100) + start = 2 + stop = 11 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscFletcherTestCase(BasicTestCase): + # see gh-21 + shape = (200, 300) + compress = 1 + shuffle = 1 + fletcher32 = 1 + complib = "blosc" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscBloscLZTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:blosclz" + chunkshape = (200, 100) + start = 2 + stop = 11 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class 
BloscLZ4TestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:lz4" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class BloscLZ4HCTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:lz4hc" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf('snappy' not in tb.blosc_compressor_list(), + 'snappy required') +class BloscSnappyTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:snappy" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zlib' not in tb.blosc_compressor_list(), 'zlib required') +class BloscZlibTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:zlib" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zstd' not in tb.blosc_compressor_list(), 'zstd required') +class BloscZstdTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "blosc:zstd" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class LZOComprTestCase(BasicTestCase): + compress = 1 # sss + complib = "lzo" + chunkshape = (10, 10) + start = 3 + stop = 10 + step = 3 + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class LZOShuffleTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "lzo" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ComprTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + complib = "bzip2" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 8 + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ShuffleTestCase(BasicTestCase): + shape = (20, 30) + compress = 1 + shuffle = 1 + complib = "bzip2" + chunkshape = (100, 100) + start = 3 + stop = 10 + step = 6 + + +class Fletcher32TestCase(BasicTestCase): + shape = (60, 50) + compress = 0 + fletcher32 = 1 + chunkshape = (50, 50) + start = 4 + stop = 20 + step = 7 + + +class AllFiltersTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + fletcher32 = 1 + complib = "zlib" + chunkshape = (20, 20) # sss + start = 2 + stop = 99 + step = 6 + + +class FloatTypeTestCase(BasicTestCase): + type = 'float64' + shape = (2, 2) + chunkshape = (5, 5) + start = 3 + stop = 10 + step = 20 + + +class ComplexTypeTestCase(BasicTestCase): + type = 'complex128' + shape = (2, 2) + chunkshape = (5, 5) + start = 3 + stop = 10 + step = 20 + + +class StringTestCase(BasicTestCase): + type = "string" + length = 20 + shape = (2, 2) + # shape = (2,2,20) + chunkshape = (5, 5) + start = 3 + stop = 10 + step = 20 + slices = (slice(0, 1), slice(1, 2)) + + +class String2TestCase(BasicTestCase): + type = "string" + length = 20 + shape = (2, 20) + 
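+    # Note: the chunk shape does not have to divide the array shape evenly;
+    # several of the cases above even use chunks larger than the array.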
chunkshape = (5, 5) + start = 1 + stop = 10 + step = 2 + + +class StringComprTestCase(BasicTestCase): + type = "string" + length = 20 + shape = (20, 2, 10) + # shape = (20,0,10,20) + compr = 1 + # shuffle = 1 # this shouldn't do nothing on chars + chunkshape = (50, 50, 2) + start = -1 + stop = 100 + step = 20 + + +class Int8TestCase(BasicTestCase): + type = "int8" + shape = (2, 2) + compress = 1 + shuffle = 1 + chunkshape = (50, 50) + start = -1 + stop = 100 + step = 20 + + +class Int16TestCase(BasicTestCase): + type = "int16" + shape = (2, 2) + compress = 1 + shuffle = 1 + chunkshape = (50, 50) + start = 1 + stop = 100 + step = 1 + + +class Int32TestCase(BasicTestCase): + type = "int32" + shape = (2, 2) + compress = 1 + shuffle = 1 + chunkshape = (50, 50) + start = -1 + stop = 100 + step = 20 + + +@common.unittest.skipUnless(hasattr(tb, 'Float16Atom'), + 'Float16Atom not available') +class Float16TestCase(BasicTestCase): + type = "float16" + shape = (200,) + compress = 1 + shuffle = 1 + chunkshape = (20,) + start = -1 + stop = 100 + step = 20 + + +class Float32TestCase(BasicTestCase): + type = "float32" + shape = (200,) + compress = 1 + shuffle = 1 + chunkshape = (20,) + start = -1 + stop = 100 + step = 20 + + +class Float64TestCase(BasicTestCase): + type = "float64" + shape = (200,) + compress = 1 + shuffle = 1 + chunkshape = (20,) + start = -1 + stop = 100 + step = 20 + + +@common.unittest.skipUnless(hasattr(tb, 'Float96Atom'), + 'Float96Atom not available') +class Float96TestCase(BasicTestCase): + type = "float96" + shape = (200,) + compress = 1 + shuffle = 1 + chunkshape = (20,) + start = -1 + stop = 100 + step = 20 + + +@common.unittest.skipUnless(hasattr(tb, 'Float128Atom'), + 'Float128Atom not available') +class Float128TestCase(BasicTestCase): + type = "float128" + shape = (200,) + compress = 1 + shuffle = 1 + chunkshape = (20,) + start = -1 + stop = 100 + step = 20 + + +class Complex64TestCase(BasicTestCase): + type = "complex64" + shape = (4,) + compress = 1 + shuffle = 1 + chunkshape = (2,) + start = -1 + stop = 100 + step = 20 + + +class Complex128TestCase(BasicTestCase): + type = "complex128" + shape = (20,) + compress = 1 + shuffle = 1 + chunkshape = (2,) + start = -1 + stop = 100 + step = 20 + + +@common.unittest.skipUnless(hasattr(tb, 'Complex192Atom'), + 'Complex192Atom not available') +class Complex192TestCase(BasicTestCase): + type = "complex192" + shape = (20,) + compress = 1 + shuffle = 1 + chunkshape = (2,) + start = -1 + stop = 100 + step = 20 + + +@common.unittest.skipUnless(hasattr(tb, 'Complex256Atom'), + 'Complex256Atom not available') +class Complex256TestCase(BasicTestCase): + type = "complex256" + shape = (20,) + compress = 1 + shuffle = 1 + chunkshape = (2,) + start = -1 + stop = 100 + step = 20 + + +class ComprTestCase(BasicTestCase): + type = "float64" + compress = 1 + shuffle = 1 + shape = (200,) + compr = 1 + chunkshape = (21,) + start = 51 + stop = 100 + step = 7 + + +# this is a subset of the tests in test_array.py, mostly to verify that errors +# are handled in the same way +class ReadOutArgumentTests(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.size = 1000 + self.filters = tb.Filters(complevel=1, complib='blosc') + + def create_array(self): + array = np.arange(self.size, dtype='i8') + disk_array = self.h5file.create_carray('/', 'array', + atom=tb.Int64Atom(), + shape=(self.size,), + filters=self.filters) + disk_array[:] = array + return array, disk_array + + def test_read_entire_array(self): + array, 
disk_array = self.create_array() + out_buffer = np.empty((self.size, ), 'i8') + disk_array.read(out=out_buffer) + np.testing.assert_equal(out_buffer, array) + + def test_read_non_contiguous_buffer(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size, ), 'i8') + out_buffer_slice = out_buffer[0:self.size:2] + + with self.assertRaisesRegex(ValueError, + 'output array not C contiguous'): + disk_array.read(0, self.size, 2, out_buffer_slice) + + def test_buffer_too_small(self): + array, disk_array = self.create_array() + out_buffer = np.empty((self.size // 2, ), 'i8') + self.assertRaises(ValueError, disk_array.read, 0, self.size, 1, + out_buffer) + try: + disk_array.read(0, self.size, 1, out_buffer) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + +class SizeOnDiskInMemoryPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.array_size = (10_000, 10) + # set chunkshape so it divides evenly into array_size, to avoid + # partially filled chunks + self.chunkshape = (1000, 10) + # approximate size (in bytes) of non-data portion of hdf5 file + self.hdf_overhead = 6000 + + def create_array(self, complevel): + filters = tb.Filters(complevel=complevel, complib='blosc') + self.array = self.h5file.create_carray('/', 'somearray', + atom=tb.Int16Atom(), + shape=self.array_size, + filters=filters, + chunkshape=self.chunkshape) + + def test_no_data(self): + complevel = 0 + self.create_array(complevel) + self.assertEqual(self.array.size_on_disk, 0) + self.assertEqual(self.array.size_in_memory, 10_000 * 10 * 2) + + def test_data_no_compression(self): + complevel = 0 + self.create_array(complevel) + self.array[:] = 1 + self.assertEqual(self.array.size_on_disk, 10_000 * 10 * 2) + self.assertEqual(self.array.size_in_memory, 10_000 * 10 * 2) + + def test_highly_compressible_data(self): + complevel = 1 + self.create_array(complevel) + self.array[:] = 1 + self.h5file.flush() + file_size = Path(self.h5fname).stat().st_size + self.assertTrue( + abs(self.array.size_on_disk - file_size) <= self.hdf_overhead) + self.assertTrue(self.array.size_on_disk < self.array.size_in_memory) + self.assertEqual(self.array.size_in_memory, 10_000 * 10 * 2) + + # XXX + def test_random_data(self): + complevel = 1 + self.create_array(complevel) + self.array[:] = np.random.randint(0, 1e6, self.array_size) + self.h5file.flush() + file_size = Path(self.h5fname).stat().st_size + self.assertTrue( + abs(self.array.size_on_disk - file_size) <= self.hdf_overhead) + + # XXX: check. The test fails if blosc is not available + if tb.which_lib_version('blosc') is not None: + self.assertAlmostEqual(self.array.size_on_disk, 10_000 * 10 * 2) + else: + self.assertTrue( + abs(self.array.size_on_disk - 10_000 * 10 * 2) < 200) + + +class OffsetStrideTestCase(common.TempFileMixin, common.PyTablesTestCase): + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + + def test01a_String(self): + """Checking carray with offseted NumPy strings appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_String..." 
% self.__class__.__name__) + + shape = (3, 2, 2) + # Create an string atom + carray = self.h5file.create_carray(root, 'strings', + atom=tb.StringAtom(itemsize=3), + shape=shape, + title="Array of strings", + chunkshape=(1, 2, 2)) + a = np.array([[["a", "b"], ["123", "45"], ["45", "123"]]], dtype="S3") + carray[0] = a[0, 1:] + a = np.array([[["s", "a"], ["ab", "f"], ["s", "abc"], ["abc", "f"]]]) + carray[1] = a[0, 2:] + + # Read all the data: + data = carray.read() + if common.verbose: + print("Object read:", data) + print("Nrows in", carray._v_pathname, ":", carray.nrows) + print("Second row in carray ==>", data[1].tolist()) + + self.assertEqual(carray.nrows, 3) + self.assertEqual(data[0].tolist(), [[b"123", b"45"], [b"45", b"123"]]) + self.assertEqual(data[1].tolist(), [[b"s", b"abc"], [b"abc", b"f"]]) + self.assertEqual(len(data[0]), 2) + self.assertEqual(len(data[1]), 2) + + def test01b_String(self): + """Checking carray with strided NumPy strings appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_String..." % self.__class__.__name__) + + shape = (3, 2, 2) + + # Create an string atom + carray = self.h5file.create_carray(root, 'strings', + atom=tb.StringAtom(itemsize=3), + shape=shape, + title="Array of strings", + chunkshape=(1, 2, 2)) + a = np.array([[["a", "b"], ["123", "45"], ["45", "123"]]], dtype="S3") + carray[0] = a[0, ::2] + a = np.array([[["s", "a"], ["ab", "f"], ["s", "abc"], ["abc", "f"]]]) + carray[1] = a[0, ::2] + + # Read all the rows: + data = carray.read() + if common.verbose: + print("Object read:", data) + print("Nrows in", carray._v_pathname, ":", carray.nrows) + print("Second row in carray ==>", data[1].tolist()) + + self.assertEqual(carray.nrows, 3) + self.assertEqual(data[0].tolist(), [[b"a", b"b"], [b"45", b"123"]]) + self.assertEqual(data[1].tolist(), [[b"s", b"a"], [b"s", b"abc"]]) + self.assertEqual(len(data[0]), 2) + self.assertEqual(len(data[1]), 2) + + def test02a_int(self): + """Checking carray with offseted NumPy ints appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02a_int..." % self.__class__.__name__) + + shape = (3, 3) + + # Create an string atom + carray = self.h5file.create_carray(root, 'CAtom', + atom=tb.Int32Atom(), shape=shape, + title="array of ints", + chunkshape=(1, 3)) + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (0, 0, 0)], + dtype='int32') + carray[0:2] = a[2:] # Introduce an offset + a = np.array([(1, 1, 1), (-1, 0, 0)], dtype='int32') + carray[2:3] = a[1:] # Introduce an offset + + # Read all the rows: + data = carray.read() + if common.verbose: + print("Object read:", data) + print("Nrows in", carray._v_pathname, ":", carray.nrows) + print("Third row in carray ==>", data[2]) + + self.assertEqual(carray.nrows, 3) + self.assertTrue(common.allequal( + data[0], np.array([1, 1, 1], dtype='int32'))) + self.assertTrue(common.allequal( + data[1], np.array([0, 0, 0], dtype='int32'))) + self.assertTrue(common.allequal( + data[2], np.array([-1, 0, 0], dtype='int32'))) + + def test02b_int(self): + """Checking carray with strided NumPy ints appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_int..." 
% self.__class__.__name__) + + shape = (3, 3) + + # Create an string atom + carray = self.h5file.create_carray(root, 'CAtom', + atom=tb.Int32Atom(), shape=shape, + title="array of ints", + chunkshape=(1, 3)) + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='int32') + carray[0:2] = a[::3] # Create an offset + a = np.array([(1, 1, 1), (-1, 0, 0)], dtype='int32') + carray[2:3] = a[::2] # Create an offset + + # Read all the rows: + data = carray.read() + if common.verbose: + print("Object read:", data) + print("Nrows in", carray._v_pathname, ":", carray.nrows) + print("Third row in carray ==>", data[2]) + + self.assertEqual(carray.nrows, 3) + self.assertTrue(common.allequal( + data[0], np.array([0, 0, 0], dtype='int32'))) + self.assertTrue(common.allequal( + data[1], np.array([3, 3, 3], dtype='int32'))) + self.assertTrue(common.allequal( + data[2], np.array([1, 1, 1], dtype='int32'))) + + +class CopyTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01a_copy(self): + """Checking CArray.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_copy..." % self.__class__.__name__) + + # Create an CArray + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[...] = np.array([[456, 2], [3, 457]], dtype='int16') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test01b_copy(self): + """Checking CArray.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_copy..." % self.__class__.__name__) + + # Create an CArray + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(5, 5)) + array1[...] 
= np.array([[456, 2], [3, 457]], dtype='int16') + + if self.close: + if common.verbose: + print("(closing h5fname version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing h5fname version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # By default, the chunkshape should be the same + self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test01c_copy(self): + """Checking CArray.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_copy..." % self.__class__.__name__) + + # Create an CArray + shape = (5, 5) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[:2, :2] = np.array([[456, 2], [3, 457]], dtype='int16') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test02_copy(self): + """Checking CArray.copy() method (where specified)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_copy..." 
% self.__class__.__name__) + + # Create an CArray + shape = (5, 5) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[:2, :2] = np.array([[456, 2], [3, 457]], dtype='int16') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy to another location + group1 = self.h5file.create_group("/", "group1") + array2 = array1.copy(group1, 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.group1.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test03a_copy(self): + """Checking CArray.copy() method (python flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03c_copy..." % self.__class__.__name__) + + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1.flavor = "python" + array1[...] = [[456, 2], [3, 457]] + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertEqual(array1.read(), array2.read()) + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test03b_copy(self): + """Checking CArray.copy() method (string python flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03d_copy..." 
% self.__class__.__name__) + + shape = (2, 2) + atom = tb.StringAtom(itemsize=4) + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1.flavor = "python" + array1[...] = [["456", "2"], ["3", "457"]] + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("type value-->", type(array2[:][0][0])) + print("value-->", array2[:]) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertEqual(array1.read(), array2.read()) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test03c_copy(self): + """Checking CArray.copy() method (chararray flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03e_copy..." % self.__class__.__name__) + + shape = (2, 2) + atom = tb.StringAtom(itemsize=4) + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[...] = np.array([["456", "2"], ["3", "457"]], dtype="S4") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # self.assertEqual(array1.chunkshape, array2.chunkshape) + + def test04_copy(self): + """Checking CArray.copy() method (checking title copying)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_copy..." 
% self.__class__.__name__) + + # Create an CArray + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[...] = np.array([[456, 2], [3, 457]], dtype='int16') + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', title="title array2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + # Assert user attributes + if common.verbose: + print("title of destination array-->", array2.title) + self.assertEqual(array2.title, "title array2") + + def test05_copy(self): + """Checking CArray.copy() method (user attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_copy..." % self.__class__.__name__) + + # Create an CArray + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[...] = np.array([[456, 2], [3, 457]], dtype='int16') + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(array2.attrs.attr1, "attr1") + self.assertEqual(array2.attrs.attr2, 2) + + def test05b_copy(self): + """Checking CArray.copy() method (user attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_copy..." % self.__class__.__name__) + + # Create an Array + shape = (2, 2) + atom = tb.Int16Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + array1[...] 
= np.array([[456, 2], [3, 457]], dtype='int16') + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode="a") + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(hasattr(array2.attrs, "attr1"), 0) + self.assertEqual(hasattr(array2.attrs, "attr2"), 0) + + +class CloseCopyTestCase(CopyTestCase): + close = 1 + + +class OpenCopyTestCase(CopyTestCase): + close = 0 + + +class CopyIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrowsinbuf = 2 + + def test01_index(self): + """Checking CArray.copy() method with indexes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_index..." % self.__class__.__name__) + + # Create an CArray + shape = (100, 2) + atom = tb.Int32Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + r = np.arange(200, dtype='int32') + r.shape = shape + array1[...] = r + + # Select a different buffer size: + array1.nrowsinbuf = self.nrowsinbuf + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + + # The next line is commented out because a copy should not + # keep the same chunkshape anymore. + # F. Alted 2006-11-27 + # assert array1.chunkshape == array2.chunkshape + self.assertEqual(r2.shape[0], array2.nrows) + + def _test02_indexclosef(self): + """Checking CArray.copy() method with indexes (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_indexclosef..." % self.__class__.__name__) + + # Create an CArray + shape = (100, 2) + atom = tb.Int32Atom() + array1 = self.h5file.create_carray( + self.h5file.root, 'array1', atom=atom, shape=shape, + title="title array1", chunkshape=(2, 2)) + r = np.arange(200, dtype='int32') + r.shape = shape + array1[...] 
= r + + # Select a different buffer size: + array1.nrowsinbuf = self.nrowsinbuf + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + + # Close and reopen the file + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertEqual(array1.chunkshape, array2.chunkshape) + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], array2.nrows) + + +class CopyIndex1TestCase(CopyIndexTestCase): + nrowsinbuf = 1 + start = 0 + stop = 7 + step = 1 + + +class CopyIndex2TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 0 + stop = -1 + step = 1 + + +class CopyIndex3TestCase(CopyIndexTestCase): + nrowsinbuf = 3 + start = 1 + stop = 7 + step = 1 + + +class CopyIndex4TestCase(CopyIndexTestCase): + nrowsinbuf = 4 + start = 0 + stop = 6 + step = 1 + + +class CopyIndex5TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 3 + stop = 7 + step = 1 + + +class CopyIndex6TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 3 + stop = 6 + step = 2 + + +class CopyIndex7TestCase(CopyIndexTestCase): + start = 0 + stop = 7 + step = 10 + + +class CopyIndex8TestCase(CopyIndexTestCase): + start = 6 + stop = -1 # Negative values means starting from the end + step = 1 + + +class CopyIndex9TestCase(CopyIndexTestCase): + start = 3 + stop = 4 + step = 1 + + +class CopyIndex10TestCase(CopyIndexTestCase): + nrowsinbuf = 1 + start = 3 + stop = 4 + step = 2 + + +class CopyIndex11TestCase(CopyIndexTestCase): + start = -3 + stop = -1 + step = 2 + + +class CopyIndex12TestCase(CopyIndexTestCase): + start = -1 # Should point to the last element + stop = None # None should mean the last element (including it) + step = 1 + + +# The next test should be run only in **heavy** mode +class Rows64bitsTestCase(common.TempFileMixin, common.PyTablesTestCase): + narows = 1000 * 1000 # each array will have 1 million entries + # narows = 1000 # for testing only + nanumber = 1000 * 3 # That should account for more than 2**31-1 + + def setUp(self): + super().setUp() + + # Create an CArray + shape = (self.narows * self.nanumber,) + array = self.h5file.create_carray( + self.h5file.root, 'array', + atom=tb.Int8Atom(), shape=shape, + filters=tb.Filters(complib='lzo', complevel=1)) + + # Fill the array + na = np.arange(self.narows, dtype='int8') + # for i in xrange(self.nanumber): + # s = slice(i * self.narows, (i + 1)*self.narows) + # array[s] = na + s = slice(0, self.narows) + array[s] = na + s = slice((self.nanumber-1)*self.narows, self.nanumber * self.narows) + array[s] = na + + def test01_basiccheck(self): + """Some basic checks for carrays exceeding 2**31 rows""" + + array = self.h5file.root.array + + if self.close: + if common.verbose: + # Check how many entries there are in the array + print("Before closing") + print("Entries:", array.nrows, type(array.nrows)) + print("Entries:", array.nrows / (1000 * 1000), "Millions") + print("Shape:", array.shape) + + # Re-open the file + self._reopen() + array = self.h5file.root.array + if common.verbose: + print("After re-open") + + # Check how 
many entries there are in the array + if common.verbose: + print("Entries:", array.nrows, type(array.nrows)) + print("Entries:", array.nrows / (1000 * 1000), "Millions") + print("Shape:", array.shape) + print("Last 10 elements-->", array[-10:]) + stop = self.narows % 256 + if stop > 127: + stop -= 256 + start = stop - 10 + # print("start, stop-->", start, stop) + print("Should look like:", np.arange(start, stop, dtype='int8')) + + nrows = self.narows * self.nanumber + + # check nrows + self.assertEqual(array.nrows, nrows) + + # Check shape + self.assertEqual(array.shape, (nrows,)) + + # check the 10 first elements + self.assertTrue(common.allequal( + array[:10], np.arange(10, dtype='int8'))) + + # check the 10 last elements + stop = self.narows % 256 + if stop > 127: + stop -= 256 + start = stop - 10 + self.assertTrue(common.allequal( + array[-10:], np.arange(start, stop, dtype='int8'))) + + +class Rows64bitsTestCase1(Rows64bitsTestCase): + close = 0 + + +class Rows64bitsTestCase2(Rows64bitsTestCase): + close = 1 + + +class BigArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + shape = (3_000_000_000,) # more than 2**31-1 + + def setUp(self): + super().setUp() + # This should be fast since disk space isn't actually allocated, + # so this case is OK for non-heavy test runs. + self.h5file.create_carray('/', 'array', + atom=tb.Int8Atom(), shape=self.shape) + + def test00_shape(self): + """Check that the shape doesn't overflow.""" + # See ticket #147. + self.assertEqual(self.h5file.root.array.shape, self.shape) + try: + self.assertEqual(len(self.h5file.root.array), self.shape[0]) + except OverflowError: + # This can't be avoided in 32-bit platforms. + self.assertTrue(self.shape[0] > np.iinfo(int).max, + "Array length overflowed but ``int`` " + "is wide enough.") + + def test01_shape_reopen(self): + """Check that the shape doesn't overflow after reopening.""" + self._reopen('r') + self.test00_shape() + + +# Test for default values when creating arrays. 
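+# A brief illustrative sketch (a hypothetical helper with an assumed file
+# name argument; it is not referenced by the test classes below): chunks of
+# a CArray that are never written read back as the atom's ``dflt`` value,
+# which is what DfltAtomTestCase below verifies for string, int and float
+# atoms.
+def _dflt_roundtrip_sketch(h5fname):
+    """Return the contents of a CArray that was created but never written."""
+    with tb.open_file(h5fname, mode="w") as fileh:
+        ca = fileh.create_carray('/', 'bar', atom=tb.IntAtom(dflt=1),
+                                 shape=(10, 10))
+        # Nothing has been assigned, so every element comes back as the
+        # atom's default value (1 here).
+        return ca[:]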
+class DfltAtomTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_dflt(self): + """Check that Atom.dflt is honored (string version).""" + + # Create a CArray with default values + self.h5file.create_carray( + '/', 'bar', atom=tb.StringAtom(itemsize=5, dflt=b"abdef"), + shape=(10, 10)) + + if self.reopen: + self._reopen() + + # Check the values + values = self.h5file.root.bar[:] + if common.verbose: + print("Read values:", values) + self.assertTrue(common.allequal( + values, np.array(["abdef"] * 100, "S5").reshape(10, 10))) + + def test01_dflt(self): + """Check that Atom.dflt is honored (int version).""" + + # Create a CArray with default values + self.h5file.create_carray('/', 'bar', + atom=tb.IntAtom(dflt=1), shape=(10, 10)) + + if self.reopen: + self._reopen() + + # Check the values + values = self.h5file.root.bar[:] + if common.verbose: + print("Read values:", values) + self.assertTrue(common.allequal(values, np.ones((10, 10), "i4"))) + + def test02_dflt(self): + """Check that Atom.dflt is honored (float version).""" + + # Create a CArray with default values + self.h5file.create_carray( + '/', 'bar', atom=tb.FloatAtom(dflt=1.134), shape=(10, 10)) + + if self.reopen: + self._reopen() + + # Check the values + values = self.h5file.root.bar[:] + if common.verbose: + print("Read values:", values) + self.assertTrue(common.allequal(values, np.ones((10, 10), "f8")*1.134)) + + +class DfltAtomNoReopen(DfltAtomTestCase): + reopen = False + + +class DfltAtomReopen(DfltAtomTestCase): + reopen = True + + +# Test for representation of defaults in atoms. Ticket #212. +class AtomDefaultReprTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00a_zeros(self): + """Testing default values. Zeros (scalar).""" + + N = () + atom = tb.StringAtom(itemsize=3, shape=N, dflt=b"") + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", repr(ca[0])) + print("Defaults-->", repr(ca.atom.dflt)) + self.assertTrue(common.allequal(ca[0], np.zeros(N, 'S3'))) + self.assertTrue(common.allequal(ca.atom.dflt, np.zeros(N, 'S3'))) + + def test00b_zeros(self): + """Testing default values. Zeros (array).""" + + N = 2 + atom = tb.StringAtom(itemsize=3, shape=N, dflt=b"") + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", ca[0]) + print("Defaults-->", ca.atom.dflt) + self.assertTrue(common.allequal(ca[0], np.zeros(N, 'S3'))) + self.assertTrue(common.allequal(ca.atom.dflt, np.zeros(N, 'S3'))) + + def test01a_values(self): + """Testing default values. Ones.""" + + N = 2 + atom = tb.Int32Atom(shape=N, dflt=1) + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", ca[0]) + print("Defaults-->", ca.atom.dflt) + self.assertTrue(common.allequal(ca[0], np.ones(N, 'i4'))) + self.assertTrue(common.allequal(ca.atom.dflt, np.ones(N, 'i4'))) + + def test01b_values(self): + """Testing default values. 
Generic value.""" + + N = 2 + generic = 112.32 + atom = tb.Float32Atom(shape=N, dflt=generic) + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", ca[0]) + print("Defaults-->", ca.atom.dflt) + self.assertTrue(common.allequal(ca[0], np.ones(N, 'f4')*generic)) + self.assertTrue(common.allequal( + ca.atom.dflt, np.ones(N, 'f4')*generic)) + + def test02a_None(self): + """Testing default values. None (scalar).""" + + N = () + atom = tb.Int32Atom(shape=N, dflt=None) + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", repr(ca[0])) + print("Defaults-->", repr(ca.atom.dflt)) + self.assertTrue(common.allequal(ca.atom.dflt, np.zeros(N, 'i4'))) + + def test02b_None(self): + """Testing default values. None (array).""" + + N = 2 + atom = tb.Int32Atom(shape=N, dflt=None) + ca = self.h5file.create_carray('/', 'test', atom=atom, shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", ca[0]) + print("Defaults-->", ca.atom.dflt) + self.assertTrue(common.allequal(ca.atom.dflt, np.zeros(N, 'i4'))) + + +class AtomDefaultReprNoReopen(AtomDefaultReprTestCase): + reopen = False + + +class AtomDefaultReprReopen(AtomDefaultReprTestCase): + reopen = True + + +class TruncateTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test(self): + """Test for unability to truncate Array objects.""" + + array1 = self.h5file.create_carray('/', 'array1', tb.IntAtom(), [2, 2]) + self.assertRaises(TypeError, array1.truncate, 0) + + +# Test for dealing with multidimensional atoms +class MDAtomTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01a_assign(self): + """Assign a row to a (unidimensional) CArray with a MD atom.""" + + # Create an CArray + ca = self.h5file.create_carray('/', 'test', + atom=tb.Int32Atom((2, 2)), shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Assign one row + ca[0] = [[1, 3], [4, 5]] + self.assertEqual(ca.nrows, 1) + if common.verbose: + print("First row-->", ca[0]) + self.assertTrue(common.allequal( + ca[0], np.array([[1, 3], [4, 5]], 'i4'))) + + def test01b_assign(self): + """Assign several rows to a (unidimensional) CArray with a MD atom.""" + + # Create an CArray + ca = self.h5file.create_carray('/', 'test', + atom=tb.Int32Atom((2, 2)), shape=(3,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Assign three rows + ca[:] = [[[1]], [[2]], [[3]]] # Simple broadcast + self.assertEqual(ca.nrows, 3) + if common.verbose: + print("Third row-->", ca[2]) + self.assertTrue(common.allequal( + ca[2], np.array([[3, 3], [3, 3]], 'i4'))) + + def test02a_assign(self): + """Assign a row to a (multidimensional) CArray with a MD atom.""" + + # Create an CArray + ca = self.h5file.create_carray('/', 'test', + atom=tb.Int32Atom((2,)), shape=(1, 3)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Assign one row + ca[:] = [[[1, 3], [4, 5], [7, 9]]] + self.assertEqual(ca.nrows, 1) + if common.verbose: + print("First row-->", ca[0]) + self.assertTrue(common.allequal(ca[0], np.array( + [[1, 3], [4, 5], [7, 9]], 'i4'))) + + def test02b_assign(self): + """Assign several rows to a (multidimensional) CArray with + a MD atom.""" + + # Create an 
CArray + ca = self.h5file.create_carray('/', 'test', + atom=tb.Int32Atom((2,)), shape=(3, 3)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + # Assign three rows + ca[:] = [[[1, -3], [4, -5], [-7, 9]], + [[-1, 3], [-4, 5], [7, -8]], + [[-2, 3], [-5, 5], [7, -9]]] + self.assertEqual(ca.nrows, 3) + if common.verbose: + print("Third row-->", ca[2]) + self.assertTrue(common.allequal( + ca[2], np.array([[-2, 3], [-5, 5], [7, -9]], 'i4'))) + + def test03a_MDMDMD(self): + """Complex assign of a MD array in a MD CArray with a MD atom.""" + + # Create an CArray + ca = self.h5file.create_carray( + '/', 'test', atom=tb.Int32Atom((2, 4)), shape=(3, 2, 3)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + + # Assign values + # The shape of the atom should be added at the end of the arrays + a = np.arange(2 * 3*2*4, dtype='i4').reshape((2, 3, 2, 4)) + ca[:] = [a * 1, a*2, a*3] + self.assertEqual(ca.nrows, 3) + if common.verbose: + print("Third row-->", ca[2]) + self.assertTrue(common.allequal(ca[2], a * 3)) + + def test03b_MDMDMD(self): + """Complex assign of a MD array in a MD CArray with a MD atom (II).""" + + # Create an CArray + ca = self.h5file.create_carray( + '/', 'test', atom=tb.Int32Atom((2, 4)), shape=(2, 3, 3)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + + # Assign values + # The shape of the atom should be added at the end of the arrays + a = np.arange(2 * 3*3*2*4, dtype='i4').reshape((2, 3, 3, 2, 4)) + ca[:] = a + self.assertEqual(ca.nrows, 2) + if common.verbose: + print("Third row-->", ca[:, 2, ...]) + self.assertTrue(common.allequal(ca[:, 2, ...], a[:, 2, ...])) + + def test03c_MDMDMD(self): + """Complex assign of a MD array in a MD CArray with a MD atom (III).""" + + # Create an CArray + ca = self.h5file.create_carray( + '/', 'test', atom=tb.Int32Atom((2, 4)), shape=(3, 1, 2)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + + # Assign values + # The shape of the atom should be added at the end of the arrays + a = np.arange(3 * 1*2*2*4, dtype='i4').reshape((3, 1, 2, 2, 4)) + ca[:] = a + self.assertEqual(ca.nrows, 3) + if common.verbose: + print("Second row-->", ca[:, :, 1, ...]) + self.assertTrue(common.allequal(ca[:, :, 1, ...], a[:, :, 1, ...])) + + +class MDAtomNoReopen(MDAtomTestCase): + reopen = False + + +class MDAtomReopen(MDAtomTestCase): + reopen = True + + +# Test for building very large MD atoms without defaults. Ticket #211. +class MDLargeAtomTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_create(self): + """Create a CArray with a very large MD atom.""" + + N = 2**16 # 4x larger than maximum object header size (64 KB) + ca = self.h5file.create_carray('/', 'test', + atom=tb.Int32Atom(shape=N), shape=(1,)) + if self.reopen: + self._reopen('a') + ca = self.h5file.root.test + + # Check the value + if common.verbose: + print("First row-->", ca[0]) + self.assertTrue(common.allequal(ca[0], np.zeros(N, 'i4'))) + + +class MDLargeAtomNoReopen(MDLargeAtomTestCase): + reopen = False + + +class MDLargeAtomReopen(MDLargeAtomTestCase): + reopen = True + + +class AccessClosedTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.array = self.h5file.create_carray(self.h5file.root, 'array', + atom=tb.Int32Atom(), + shape=(10, 10)) + self.array[...] 
= np.zeros((10, 10)) + + def test_read(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.read) + + def test_getitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__getitem__, 0) + + def test_setitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__setitem__, 0, 0) + + +class TestCreateCArrayArgs(common.TempFileMixin, common.PyTablesTestCase): + obj = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + where = '/' + name = 'carray' + atom = tb.Atom.from_dtype(obj.dtype) + shape = obj.shape + title = 'title' + filters = None + chunkshape = (1, 2) + byteorder = None + createparents = False + + def test_positional_args_01(self): + self.h5file.create_carray(self.where, self.name, + self.atom, self.shape, + self.title, self.filters, self.chunkshape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(np.zeros_like(self.obj), nparr)) + + def test_positional_args_02(self): + ptarr = self.h5file.create_carray(self.where, self.name, + self.atom, self.shape, + self.title, + self.filters, self.chunkshape) + ptarr[...] = self.obj + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_positional_args_obj(self): + self.h5file.create_carray(self.where, self.name, + None, None, + self.title, + self.filters, self.chunkshape, + self.byteorder, self.createparents, + self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj(self): + self.h5file.create_carray(self.where, self.name, title=self.title, + chunkshape=self.chunkshape, + obj=self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_shape_01(self): + ptarr = self.h5file.create_carray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + atom=self.atom, shape=self.shape) + ptarr[...] 
= self.obj + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_shape_02(self): + ptarr = self.h5file.create_carray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + atom=self.atom, shape=self.shape) + # ptarr[...] = self.obj + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(np.zeros_like(self.obj), nparr)) + + def test_kwargs_obj_atom(self): + ptarr = self.h5file.create_carray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + atom=self.atom) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_shape(self): + ptarr = self.h5file.create_carray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + shape=self.shape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_shape(self): + ptarr = self.h5file.create_carray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + atom=self.atom, + shape=self.shape) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_error(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_carray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom) + + def test_kwargs_obj_shape_error(self): + # atom = Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_carray, + self.where, + self.name, + title=self.title, + obj=self.obj, + shape=shape) + + def 
test_kwargs_obj_atom_shape_error_01(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_carray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=self.shape) + + def test_kwargs_obj_atom_shape_error_02(self): + # atom = Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_carray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=self.atom, + shape=shape) + + def test_kwargs_obj_atom_shape_error_03(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_carray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=shape) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + # theSuite.addTest(unittest.makeSuite(BasicTestCase)) + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite2TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite3TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite4TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite5TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite6TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite7TestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicWrite8TestCase)) + theSuite.addTest(common.unittest.makeSuite(EmptyCArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(EmptyCArray2TestCase)) + theSuite.addTest(common.unittest.makeSuite(SlicesCArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Slices2CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(EllipsisCArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis2CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis3CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscBitShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscFletcherTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscBloscLZTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscLZ4TestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscLZ4HCTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscSnappyTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscZlibTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscZstdTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(FloatTypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(ComplexTypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(StringTestCase)) + theSuite.addTest(common.unittest.makeSuite(String2TestCase)) + theSuite.addTest(common.unittest.makeSuite(StringComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(Int8TestCase)) + 
theSuite.addTest(common.unittest.makeSuite(Int16TestCase)) + theSuite.addTest(common.unittest.makeSuite(Int32TestCase)) + theSuite.addTest(common.unittest.makeSuite(Float16TestCase)) + theSuite.addTest(common.unittest.makeSuite(Float32TestCase)) + theSuite.addTest(common.unittest.makeSuite(Float64TestCase)) + theSuite.addTest(common.unittest.makeSuite(Float96TestCase)) + theSuite.addTest(common.unittest.makeSuite(Float128TestCase)) + theSuite.addTest(common.unittest.makeSuite(Complex64TestCase)) + theSuite.addTest(common.unittest.makeSuite(Complex128TestCase)) + theSuite.addTest(common.unittest.makeSuite(Complex192TestCase)) + theSuite.addTest(common.unittest.makeSuite(Complex256TestCase)) + theSuite.addTest(common.unittest.makeSuite(ComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(OffsetStrideTestCase)) + theSuite.addTest(common.unittest.makeSuite(Fletcher32TestCase)) + theSuite.addTest(common.unittest.makeSuite(AllFiltersTestCase)) + theSuite.addTest(common.unittest.makeSuite(ReadOutArgumentTests)) + theSuite.addTest(common.unittest.makeSuite( + SizeOnDiskInMemoryPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex1TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex2TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex3TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex4TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex5TestCase)) + theSuite.addTest(common.unittest.makeSuite(BigArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(DfltAtomNoReopen)) + theSuite.addTest(common.unittest.makeSuite(DfltAtomReopen)) + theSuite.addTest(common.unittest.makeSuite(AtomDefaultReprNoReopen)) + theSuite.addTest(common.unittest.makeSuite(AtomDefaultReprReopen)) + theSuite.addTest(common.unittest.makeSuite(TruncateTestCase)) + theSuite.addTest(common.unittest.makeSuite(MDAtomNoReopen)) + theSuite.addTest(common.unittest.makeSuite(MDAtomReopen)) + theSuite.addTest(common.unittest.makeSuite(MDLargeAtomNoReopen)) + theSuite.addTest(common.unittest.makeSuite(MDLargeAtomReopen)) + theSuite.addTest(common.unittest.makeSuite(AccessClosedTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestCreateCArrayArgs)) + if common.heavy: + theSuite.addTest(common.unittest.makeSuite(Slices3CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Slices4CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis4CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis5CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis6CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis7CArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD3WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD5WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD6WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD7WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD10WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex6TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex7TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex8TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex9TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex10TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex11TestCase)) + 
theSuite.addTest(common.unittest.makeSuite(CopyIndex12TestCase)) + theSuite.addTest(common.unittest.makeSuite(Rows64bitsTestCase1)) + theSuite.addTest(common.unittest.makeSuite(Rows64bitsTestCase2)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_create.py b/tables/tests/test_create.py new file mode 100644 index 0000000..5803c05 --- /dev/null +++ b/tables/tests/test_create.py @@ -0,0 +1,2603 @@ +"""This test unit checks object creation funtions, like open_file, +create_table, create_array or create_group. + +It also checks: + +- name identifiers in tree objects +- title character limit for objects (255) +- limit in number in table fields (255) + +""" + +import sys +import hashlib +import tempfile +import warnings +from pathlib import Path +from packaging.version import Version + +import numpy as np + +import tables as tb +from tables.tests import common + + +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) # 4-character String + var2 = tb.IntCol() # integer + var3 = tb.Int16Col() # short integer + var4 = tb.FloatCol() # double (double-precision) + var5 = tb.Float32Col() # float (single-precision) + + +class CreateTestCase(common.TempFileMixin, common.PyTablesTestCase): + title = "This is the table title" + expectedrows = 100 + maxshort = 2 ** 15 + maxint = 2_147_483_648 # (2 ** 31) + compress = 0 + + def setUp(self): + super().setUp() + + # Create an instance of HDF5 Table + self.root = self.h5file.root + + # Create a table object + self.table = self.h5file.create_table(self.root, 'atable', + Record, "Table title") + # Create an array object + self.array = self.h5file.create_array(self.root, 'anarray', + [1], "Array title") + # Create a group object + self.group = self.h5file.create_group(self.root, 'agroup', + "Group title") + + def test00_isClass(self): + """Testing table creation.""" + + self.assertIsInstance(self.table, tb.Table) + self.assertIsInstance(self.array, tb.Array) + self.assertIsInstance(self.array, tb.Leaf) + self.assertIsInstance(self.group, tb.Group) + + def test01_overwriteNode(self): + """Checking protection against node overwriting.""" + + try: + self.array = self.h5file.create_array(self.root, 'anarray', + [1], "Array title") + except tb.NodeError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next NameError was catched!") + print(value) + else: + self.fail("expected a tables.NodeError") + + def test02_syntaxname(self): + """Checking syntax in object tree names.""" + + with self.assertWarns(tb.NaturalNameWarning): + self.array = self.h5file.create_array(self.root, ' array', + [1], "Array title") + + # another name error + with self.assertWarns(tb.NaturalNameWarning): + self.array = self.h5file.create_array(self.root, '$array', + [1], "Array title") + + # Finally, test a reserved word + with self.assertWarns(tb.NaturalNameWarning): + self.array = self.h5file.create_array(self.root, 'for', + [1], "Array title") + + def test03a_titleAttr(self): + """Checking the self.title attr in nodes.""" + + # Close the opened file to destroy the object tree + self._reopen() + + # Now, test that self.title exists and is correct in all the nodes + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + self.assertEqual(self.h5file.root.atable.title, "Table title") + self.assertEqual(self.h5file.root.anarray.title, "Array title") + + def test03b_titleLength(self): + """Checking large 
title character length limit (1023)""" + + titlelength = 1023 + # Try to put a very long title on a group object + group = self.h5file.create_group(self.root, 'group', "t" * titlelength) + self.assertEqual(group._v_title, "t" * titlelength) + self.assertEqual(group._f_getattr('TITLE'), "t" * titlelength) + + # Now, try with a table object + table = self.h5file.create_table(self.root, 'table', + Record, "t" * titlelength) + self.assertEqual(table.title, "t" * titlelength) + self.assertEqual(table.get_attr("TITLE"), "t" * titlelength) + + # Finally, try with an Array object + arr = self.h5file.create_array(self.root, 'arr', + [1], "t" * titlelength) + self.assertEqual(arr.title, "t" * titlelength) + self.assertEqual(arr.get_attr("TITLE"), "t" * titlelength) + + def test04_maxFields(self): + """Checking a large number of fields in tables""" + + # The number of fields for a table + varnumber = tb.parameters.MAX_COLUMNS + + varnames = [] + for i in range(varnumber): + varnames.append('int%d' % i) + + # Build a dictionary with the types as values and varnames as keys + recordDict = {} + i = 0 + for varname in varnames: + recordDict[varname] = tb.Col.from_type("int32", dflt=1, pos=i) + i += 1 + # Append this entry to indicate the alignment! + recordDict['_v_align'] = "=" + table = self.h5file.create_table(self.root, 'table', + recordDict, "MetaRecord instance") + row = table.row + listrows = [] + # Write 10 records + for j in range(10): + rowlist = [] + for i in range(len(table.colnames)): + row[varnames[i]] = i * j + rowlist.append(i * j) + + row.append() + listrows.append(tuple(rowlist)) + + # write data on disk + table.flush() + + # Read all the data as a list + listout = table.read().tolist() + + # Compare the input rowlist and output row list. They should + # be equal. + if common.verbose: + print("Original row list:", listrows[-1]) + print("Retrieved row list:", listout[-1]) + self.assertEqual(listrows, listout) + + # The next limitation has been released. A warning is still there, though + def test05_maxFieldsExceeded(self): + """Checking an excess of the maximum number of fields in tables""" + + # The number of fields for a table + varnumber = tb.parameters.MAX_COLUMNS + 1 + + varnames = [] + for i in range(varnumber): + varnames.append('int%d' % i) + + # Build a dictionary with the types as values and varnames as keys + recordDict = {} + i = 0 + for varname in varnames: + recordDict[varname] = tb.Col.from_type("int32", dflt=1) + i += 1 + + # Now, create a table with this record object + # This way of creating node objects has been deprecated + # table = Table(recordDict, "MetaRecord instance") + + # Attach the table to object tree + warnings.filterwarnings("error", category=tb.PerformanceWarning) + # Here, a tables.PerformanceWarning should be raised! 
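A minimal standalone sketch (editorial, not part of the imported sources) of the same warnings-as-errors pattern exercised just below: exceeding tb.parameters.MAX_COLUMNS no longer fails hard, but a PerformanceWarning is still emitted and can be promoted to an exception. The file name is an illustrative assumption; the CORE driver keeps everything in memory.

import warnings
import tables as tb

# One column more than the soft limit.
desc = {f"int{i}": tb.Col.from_type("int32", dflt=1, pos=i)
        for i in range(tb.parameters.MAX_COLUMNS + 1)}

with tb.open_file("demo_maxcols.h5", "w", driver="H5FD_CORE",
                  driver_core_backing_store=0) as f:   # never hits disk
    with warnings.catch_warnings():
        warnings.simplefilter("error", tb.PerformanceWarning)
        try:
            f.create_table("/", "toomany", desc, "too many columns")
        except tb.PerformanceWarning as exc:
            print("PerformanceWarning raised as expected:", exc)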
+ try: + self.h5file.create_table(self.root, 'table', + recordDict, "MetaRecord instance") + except tb.PerformanceWarning: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next PerformanceWarning was catched!") + print(value) + else: + self.fail("expected an tables.PerformanceWarning") + # Reset the warning + warnings.filterwarnings("default", category=tb.PerformanceWarning) + + # The next limitation has been released + def _test06_maxColumnNameLengthExceeded(self): + """Checking an excess (256) of the maximum length in column names""" + + # Build a dictionary with the types as values and varnames as keys + recordDict = {} + recordDict["a" * 255] = tb.IntCol(dflt=1) + recordDict["b" * 256] = tb.IntCol(dflt=1) # Should raise a ValueError + + # Now, create a table with this record object + # This way of creating node objects has been deprecated + table = tb.Table(recordDict, "MetaRecord instance") + self.assertIsNotNone(table) + + # Attach the table to object tree + # Here, ValueError should be raised! + with self.assertRaises(ValueError): + self.h5file.create_table(self.root, 'table', + recordDict, "MetaRecord instance") + + def test06_noMaxColumnNameLength(self): + """Checking unlimited length in column names""" + + # Build a dictionary with the types as values and varnames as keys + recordDict = {} + recordDict["a" * 255] = tb.IntCol(dflt=1, pos=0) + recordDict["b" * 1024] = tb.IntCol(dflt=1, pos=1) # Should work well + + # Attach the table to object tree + # Here, IndexError should be raised! + table = self.h5file.create_table(self.root, 'table', + recordDict, "MetaRecord instance") + self.assertEqual(table.colnames[0], "a" * 255) + self.assertEqual(table.colnames[1], "b" * 1024) + + +class Record2(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) # 4-character String + var2 = tb.IntCol() # integer + var3 = tb.Int16Col() # short integer + + +class FiltersTreeTestCase(common.TempFileMixin, common.PyTablesTestCase): + title = "A title" + nrows = 10 + + def setUp(self): + super().setUp() + self.populateFile() + + def populateFile(self): + group = self.h5file.root + # Create a tree with three levels of depth + for j in range(5): + # Create a table + table = self.h5file.create_table(group, 'table1', Record2, + title=self.title, + filters=None) + # Get the record object associated with the new table + d = table.row + # Fill the table + for i in range(self.nrows): + d['var1'] = '%04d' % (self.nrows - i) + d['var2'] = i + d['var3'] = i * 2 + d.append() # This injects the Record values + # Flush the buffer for this table + table.flush() + + # Create a couple of arrays in each group + var1List = [x['var1'] for x in table.iterrows()] + var3List = [x['var3'] for x in table.iterrows()] + + self.h5file.create_array(group, 'array1', var1List, "col 1") + self.h5file.create_array(group, 'array2', var3List, "col 3") + + # Create a couple of EArrays as well + ea1 = self.h5file.create_earray(group, 'earray1', + tb.StringAtom(itemsize=4), (0,), + "col 1") + ea2 = self.h5file.create_earray(group, 'earray2', + tb.Int16Atom(), (0,), "col 3") + # And fill them with some values + ea1.append(var1List) + ea2.append(var3List) + + # Finally a couple of VLArrays too + vla1 = self.h5file.create_vlarray( + group, 'vlarray1', tb.StringAtom(itemsize=4), "col 1") + vla2 = self.h5file.create_vlarray( + group, 'vlarray2', tb.Int16Atom(), "col 3") + # And fill them with some values + vla1.append(var1List) + vla2.append(var3List) + + # Create a new group (descendant of group) + if j == 
1: # The second level + group2 = self.h5file.create_group(group, 'group' + str(j), + filters=self.gfilters) + elif j == 2: # third level + group2 = self.h5file.create_group(group, 'group' + str(j)) + else: # The rest of levels + group2 = self.h5file.create_group(group, 'group' + str(j), + filters=self.filters) + # Iterate over this new group (group2) + group = group2 + + def test00_checkFilters(self): + """Checking inheritance of filters on trees (open file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_checkFilters..." % + self.__class__.__name__) + + # First level check + if common.verbose: + print("Test filter:", repr(self.filters)) + print("Filters in file:", repr(self.h5file.filters)) + + if self.filters is None: + filters = tb.Filters() + else: + filters = self.filters + self.assertEqual(repr(filters), repr(self.h5file.filters)) + + # The next nodes have to have the same filter properties as + # self.filters + nodelist = [ + '/table1', '/group0/earray1', '/group0/vlarray1', '/group0', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + if isinstance(obj, tb.Group): + self.assertEqual(repr(filters), repr(obj._v_filters)) + else: + self.assertEqual(repr(filters), repr(obj.filters)) + + # Second and third level check + group1 = self.h5file.root.group0.group1 + if self.gfilters is None: + if self.filters is None: + gfilters = tb.Filters() + else: + gfilters = self.filters + else: + gfilters = self.gfilters + if common.verbose: + print("Test gfilter:", repr(gfilters)) + print("Filters in file:", repr(group1._v_filters)) + + self.assertEqual(repr(gfilters), repr(group1._v_filters)) + + # The next nodes have to have the same filter properties as + # gfilters + nodelist = [ + '/group0/group1', '/group0/group1/earray1', + '/group0/group1/vlarray1', + '/group0/group1/table1', '/group0/group1/group2/table1', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + if isinstance(obj, tb.Group): + self.assertEqual(repr(gfilters), repr(obj._v_filters)) + else: + self.assertEqual(repr(gfilters), repr(obj.filters)) + + # Fourth and fifth level check + if self.filters is None: + # If None, the filters are inherited! 
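As a quick aside (editorial sketch, not part of the patch): leaves and subgroups created without an explicit filters argument inherit the filters of their parent group, which is exactly what the checks below verify node by node. The temporary file name is illustrative.

import tempfile
import tables as tb

fname = tempfile.mktemp(".h5")          # illustrative temporary file
with tb.open_file(fname, "w") as f:
    g = f.create_group("/", "compressed", filters=tb.Filters(complevel=1))
    ea = f.create_earray(g, "data", tb.Int16Atom(), (0,))  # no filters given
    assert ea.filters.complevel == 1    # inherited from the parent group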
+ if self.gfilters is None: + filters = tb.Filters() + else: + filters = self.gfilters + else: + filters = self.filters + group3 = self.h5file.root.group0.group1.group2.group3 + if common.verbose: + print("Test filter:", repr(filters)) + print("Filters in file:", repr(group3._v_filters)) + + self.assertEqual(repr(filters), repr(group3._v_filters)) + + # The next nodes have to have the same filter properties as + # self.filter + nodelist = [ + '/group0/group1/group2/group3', + '/group0/group1/group2/group3/earray1', + '/group0/group1/group2/group3/vlarray1', + '/group0/group1/group2/group3/table1', + '/group0/group1/group2/group3/group4', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + if isinstance(obj, tb.Group): + self.assertEqual(repr(filters), repr(obj._v_filters)) + else: + self.assertEqual(repr(filters), repr(obj.filters)) + + # Checking the special case for Arrays in which the compression + # should always be the empty Filter() + # The next nodes have to have the same filter properties as + # Filter() + nodelist = [ + '/array1', + '/group0/array1', + '/group0/group1/array1', + '/group0/group1/group2/array1', + '/group0/group1/group2/group3/array1', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + self.assertEqual(repr(tb.Filters()), repr(obj.filters)) + + def test01_checkFilters(self): + """Checking inheritance of filters on trees (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_checkFilters..." % + self.__class__.__name__) + + # Close the file + self._reopen() + + # First level check + if self.filters is None: + filters = tb.Filters() + else: + filters = self.filters + if common.verbose: + print("Test filter:", repr(filters)) + print("Filters in file:", repr(self.h5file.filters)) + + self.assertEqual(repr(filters), repr(self.h5file.filters)) + + # The next nodes have to have the same filter properties as + # self.filters + nodelist = [ + '/table1', '/group0/earray1', '/group0/vlarray1', '/group0', + ] + for node in nodelist: + object_ = self.h5file.get_node(node) + if isinstance(object_, tb.Group): + self.assertEqual(repr(filters), repr(object_._v_filters)) + else: + self.assertEqual(repr(filters), repr(object_.filters)) + + # Second and third level check + group1 = self.h5file.root.group0.group1 + if self.gfilters is None: + if self.filters is None: + gfilters = tb.Filters() + else: + gfilters = self.filters + else: + gfilters = self.gfilters + if common.verbose: + print("Test filter:", repr(gfilters)) + print("Filters in file:", repr(group1._v_filters)) + + self.assertEqual(repr(gfilters), repr(group1._v_filters)) + + # The next nodes have to have the same filter properties as + # gfilters + nodelist = [ + '/group0/group1', '/group0/group1/earray1', + '/group0/group1/vlarray1', + '/group0/group1/table1', '/group0/group1/group2/table1', + ] + for node in nodelist: + object_ = self.h5file.get_node(node) + if isinstance(object_, tb.Group): + self.assertEqual(repr(gfilters), repr(object_._v_filters)) + else: + self.assertEqual(repr(gfilters), repr(object_.filters)) + + # Fourth and fifth level check + if self.filters is None: + if self.gfilters is None: + filters = tb.Filters() + else: + filters = self.gfilters + else: + filters = self.filters + group3 = self.h5file.root.group0.group1.group2.group3 + if common.verbose: + print("Test filter:", repr(filters)) + print("Filters in file:", repr(group3._v_filters)) + + repr(filters) == repr(group3._v_filters) + # The next nodes have to have the same filter 
properties as + # self.filters + nodelist = [ + '/group0/group1/group2/group3', + '/group0/group1/group2/group3/earray1', + '/group0/group1/group2/group3/vlarray1', + '/group0/group1/group2/group3/table1', + '/group0/group1/group2/group3/group4', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + if isinstance(obj, tb.Group): + self.assertEqual(repr(filters), repr(obj._v_filters)) + else: + self.assertEqual(repr(filters), repr(obj.filters)) + + # Checking the special case for Arrays in which the compression + # should always be the empty Filter() + # The next nodes have to have the same filter properties as + # Filter() + nodelist = [ + '/array1', + '/group0/array1', + '/group0/group1/array1', + '/group0/group1/group2/array1', + '/group0/group1/group2/group3/array1', + ] + for node in nodelist: + obj = self.h5file.get_node(node) + self.assertEqual(repr(tb.Filters()), repr(obj.filters)) + + +class FiltersCase1(FiltersTreeTestCase): + filters = tb.Filters() + gfilters = tb.Filters(complevel=1) + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class FiltersCase2(FiltersTreeTestCase): + filters = tb.Filters(complevel=1, complib="bzip2") + gfilters = tb.Filters(complevel=1) + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class FiltersCase3(FiltersTreeTestCase): + filters = tb.Filters(shuffle=True, complib="zlib") + gfilters = tb.Filters(complevel=1, shuffle=False, complib="lzo") + open_kwargs = dict(filters=filters) + + +class FiltersCase4(FiltersTreeTestCase): + filters = tb.Filters(shuffle=True) + gfilters = tb.Filters(complevel=1, shuffle=False) + open_kwargs = dict(filters=filters) + + +class FiltersCase5(FiltersTreeTestCase): + filters = tb.Filters(fletcher32=True) + gfilters = tb.Filters(complevel=1, shuffle=False) + open_kwargs = dict(filters=filters) + + +class FiltersCase6(FiltersTreeTestCase): + filters = None + gfilters = tb.Filters(complevel=1, shuffle=False) + open_kwargs = dict(filters=filters) + + +class FiltersCase7(FiltersTreeTestCase): + filters = tb.Filters(complevel=1) + gfilters = None + open_kwargs = dict(filters=filters) + + +class FiltersCase8(FiltersTreeTestCase): + filters = None + gfilters = None + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class FiltersCase9(FiltersTreeTestCase): + filters = tb.Filters(shuffle=True, complib="zlib") + gfilters = tb.Filters(complevel=5, shuffle=True, complib="bzip2") + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class FiltersCase10(FiltersTreeTestCase): + filters = tb.Filters(shuffle=False, complevel=1, complib="blosc") + gfilters = tb.Filters(complevel=5, shuffle=True, complib="blosc") + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class FiltersCaseBloscBloscLZ(FiltersTreeTestCase): + filters = tb.Filters(shuffle=False, complevel=1, complib="blosc:blosclz") + gfilters = tb.Filters(complevel=5, shuffle=True, complib="blosc:blosclz") + open_kwargs = dict(filters=filters) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class 
FiltersCaseBloscLZ4(FiltersTreeTestCase): + def setUp(self): + self.filters = tb.Filters(shuffle=False, complevel=1, + complib="blosc:lz4") + self.gfilters = tb.Filters(complevel=5, shuffle=True, + complib="blosc:lz4") + self.open_kwargs = dict(filters=self.filters) + super().setUp() + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class FiltersCaseBloscLZ4HC(FiltersTreeTestCase): + def setUp(self): + self.filters = tb.Filters( + shuffle=False, complevel=1, complib="blosc:lz4hc") + self.gfilters = tb.Filters( + complevel=5, shuffle=True, complib="blosc:lz4hc") + self.open_kwargs = dict(filters=self.filters) + super().setUp() + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf('snappy' not in tb.blosc_compressor_list(), + 'snappy required') +class FiltersCaseBloscSnappy(FiltersTreeTestCase): + def setUp(self): + self.filters = tb.Filters( + shuffle=False, complevel=1, complib="blosc:snappy") + self.gfilters = tb.Filters( + complevel=5, shuffle=True, complib="blosc:snappy") + self.open_kwargs = dict(filters=self.filters) + super().setUp() + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zlib' not in tb.blosc_compressor_list(), 'zlib required') +class FiltersCaseBloscZlib(FiltersTreeTestCase): + def setUp(self): + self.filters = tb.Filters(shuffle=False, complevel=1, + complib="blosc:zlib") + self.gfilters = tb.Filters(complevel=5, shuffle=True, + complib="blosc:zlib") + self.open_kwargs = dict(filters=self.filters) + super().setUp() + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zstd' not in tb.blosc_compressor_list(), 'zstd required') +class FiltersCaseBloscZstd(FiltersTreeTestCase): + def setUp(self): + self.filters = tb.Filters(shuffle=False, complevel=1, + complib="blosc:zstd") + self.gfilters = tb.Filters(complevel=5, shuffle=True, + complib="blosc:zstd") + self.open_kwargs = dict(filters=self.filters) + super().setUp() + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + common.blosc_version < common.min_blosc_bitshuffle_version, + f'BLOSC >= {common.min_blosc_bitshuffle_version} required') +class FiltersCaseBloscBitShuffle(FiltersTreeTestCase): + filters = tb.Filters(shuffle=False, complevel=1, complib="blosc:blosclz") + gfilters = tb.Filters(complevel=5, shuffle=False, bitshuffle=True, + complib="blosc:blosclz") + open_kwargs = dict(filters=filters) + # print("version:", tables.which_lib_version("blosc")[1]) + + +class CopyGroupTestCase(common.TempFileMixin, common.PyTablesTestCase): + title = "A title" + nrows = 10 + + def setUp(self): + super().setUp() + + # Create a temporary file + self.h5fname2 = tempfile.mktemp(".h5") + + # Create the destination + self.h5file2 = tb.open_file(self.h5fname2, "w") + self.populateFile() + + def populateFile(self): + group = self.h5file.root + # Add some user attrs: + group._v_attrs.attr1 = "an string for root group" + group._v_attrs.attr2 = 124 + # Create a tree + for group_i in range(5): + for bgroup_i in range(2): + # Create a new group (brother of group) + group2 = self.h5file.create_group(group, + 'bgroup' + str(bgroup_i), + filters=None) + + # Create a table + table = self.h5file.create_table(group2, 'table1', 
Record2, + title=self.title, + filters=None) + # Get the record object associated with the new table + d = table.row + # Fill the table + for row_i in range(self.nrows): + d['var1'] = '%04d' % (self.nrows - row_i) + d['var2'] = row_i + d['var3'] = row_i * 2 + d.append() # This injects the Record values + # Flush the buffer for this table + table.flush() + + # Add some user attrs: + table.attrs.attr1 = "an string" + table.attrs.attr2 = 234 + + # Create a couple of arrays in each group + var1List = [x['var1'] for x in table.iterrows()] + var3List = [x['var3'] for x in table.iterrows()] + + self.h5file.create_array(group2, 'array1', var1List, "col 1") + self.h5file.create_array(group2, 'array2', var3List, "col 3") + + # Create a couple of EArrays as well + ea1 = self.h5file.create_earray(group2, 'earray1', + tb.StringAtom(itemsize=4), + (0,), "col 1") + ea2 = self.h5file.create_earray(group2, 'earray2', + tb.Int16Atom(), (0,), "col 3") + # Add some user attrs: + ea1.attrs.attr1 = "an string for earray" + ea2.attrs.attr2 = 123 + # And fill them with some values + ea1.append(var1List) + ea2.append(var3List) + + # Create a new group (descendant of group) + group3 = self.h5file.create_group(group, 'group' + str(group_i), + filters=None) + # Iterate over this new group (group3) + group = group3 + # Add some user attrs: + group._v_attrs.attr1 = "an string for group" + group._v_attrs.attr2 = 124 + + def tearDown(self): + # Close the file + if self.h5file2.isopen: + self.h5file2.close() + Path(self.h5fname2).unlink() + + super().tearDown() + + def test00_nonRecursive(self): + """Checking non-recursive copy of a Group""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_nonRecursive..." % + self.__class__.__name__) + + # Copy a group non-recursively + srcgroup = self.h5file.root.group0.group1 + # srcgroup._f_copy_children(self.h5file2.root, recursive=False, + # filters=self.filters) + self.h5file.copy_children(srcgroup, self.h5file2.root, + recursive=False, filters=self.filters) + if self.close: + # Close the destination file + self.h5file2.close() + # And open it again + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + dstgroup = self.h5file2.root + nodelist1 = list(srcgroup._v_children) + nodelist2 = list(dstgroup._v_children) + # Sort the lists + nodelist1.sort() + nodelist2.sort() + if common.verbose: + print("The origin node list -->", nodelist1) + print("The copied node list -->", nodelist2) + self.assertEqual(srcgroup._v_nchildren, dstgroup._v_nchildren) + self.assertEqual(nodelist1, nodelist2) + + def test01_nonRecursiveAttrs(self): + """Checking non-recursive copy of a Group (attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print(f"Running {self.__class__.__name__}" + f".test01_nonRecursiveAttrs...") + + # Copy a group non-recursively with attrs + srcgroup = self.h5file.root.group0.group1 + srcgroup._f_copy_children(self.h5file2.root, + recursive=False, + filters=self.filters, + copyuserattrs=1) + if self.close: + # Close the destination file + self.h5file2.close() + # And open it again + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + dstgroup = self.h5file2.root + for srcnode in srcgroup: + dstnode = getattr(dstgroup, srcnode._v_name) + if isinstance(srcnode, tb.Group): + srcattrs = srcnode._v_attrs + srcattrskeys = srcattrs._f_list("all") + dstattrs = dstnode._v_attrs + dstattrskeys = dstattrs._f_list("all") + else: + srcattrs = 
srcnode.attrs + srcattrskeys = srcattrs._f_list("all") + dstattrs = dstnode.attrs + dstattrskeys = dstattrs._f_list("all") + + # Filters may differ, do not take into account + if self.filters is not None: + dstattrskeys.remove('FILTERS') + + # These lists should already be ordered + if common.verbose: + print(f"srcattrskeys for node {srcnode._v_name}: " + f"{srcattrskeys}") + print(f"dstattrskeys for node {dstnode._v_name}: " + f"{dstattrskeys}") + self.assertEqual(srcattrskeys, dstattrskeys) + if common.verbose: + print("The attrs names has been copied correctly") + + # Now, for the contents of attributes + for srcattrname in srcattrskeys: + srcattrvalue = str(getattr(srcattrs, srcattrname)) + dstattrvalue = str(getattr(dstattrs, srcattrname)) + self.assertEqual(srcattrvalue, dstattrvalue) + if self.filters is not None: + self.assertEqual(dstattrs.FILTERS, self.filters) + + if common.verbose: + print("The attrs contents has been copied correctly") + + def test02_Recursive(self): + """Checking recursive copy of a Group""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_Recursive..." % self.__class__.__name__) + + # Create the destination node + group = self.h5file2.root + for groupname in self.dstnode.split("/"): + if groupname: + group = self.h5file2.create_group(group, groupname) + dstgroup = self.h5file2.get_node(self.dstnode) + + # Copy a group non-recursively + srcgroup = self.h5file.get_node(self.srcnode) + self.h5file.copy_children(srcgroup, dstgroup, + recursive=True, + filters=self.filters) + lenSrcGroup = len(srcgroup._v_pathname) + if lenSrcGroup == 1: + lenSrcGroup = 0 # Case where srcgroup == "/" + if self.close: + # Close the destination file + self.h5file2.close() + # And open it again + self.h5file2 = tb.open_file(self.h5fname2, "r") + dstgroup = self.h5file2.get_node(self.dstnode) + + # Check that the copy has been done correctly + lenDstGroup = len(dstgroup._v_pathname) + if lenDstGroup == 1: + lenDstGroup = 0 # Case where dstgroup == "/" + first = 1 + nodelist1 = [] + for node in srcgroup._f_walknodes(): + if first: + # skip the first group + first = 0 + continue + nodelist1.append(node._v_pathname[lenSrcGroup:]) + + first = 1 + nodelist2 = [] + for node in dstgroup._f_walknodes(): + if first: + # skip the first group + first = 0 + continue + nodelist2.append(node._v_pathname[lenDstGroup:]) + + if common.verbose: + print("The origin node list -->", nodelist1) + print("The copied node list -->", nodelist2) + self.assertEqual(nodelist1, nodelist2) + + def test03_RecursiveFilters(self): + """Checking recursive copy of a Group (cheking Filters)""" + + if common.verbose: + print('\n', '-=' * 30) + print(f"Running {self.__class__.__name__}" + f".test03_RecursiveFilters...") + + # Create the destination node + group = self.h5file2.root + for groupname in self.dstnode.split("/"): + if groupname: + group = self.h5file2.create_group(group, groupname) + dstgroup = self.h5file2.get_node(self.dstnode) + + # Copy a group non-recursively + srcgroup = self.h5file.get_node(self.srcnode) + srcgroup._f_copy_children(dstgroup, + recursive=True, + filters=self.filters) + lenSrcGroup = len(srcgroup._v_pathname) + if lenSrcGroup == 1: + lenSrcGroup = 0 # Case where srcgroup == "/" + if self.close: + # Close the destination file + self.h5file2.close() + # And open it again + self.h5file2 = tb.open_file(self.h5fname2, "r") + dstgroup = self.h5file2.get_node(self.dstnode) + + # Check that the copy has been done correctly + lenDstGroup = len(dstgroup._v_pathname) + if 
lenDstGroup == 1: + lenDstGroup = 0 # Case where dstgroup == "/" + first = 1 + nodelist1 = {} + for node in srcgroup._f_walknodes(): + if first: + # skip the first group + first = 0 + continue + nodelist1[node._v_name] = node._v_pathname[lenSrcGroup:] + + first = 1 + for node in dstgroup._f_walknodes(): + if first: + # skip the first group + first = 0 + continue + if isinstance(node, tb.Group): + repr(node._v_filters) == repr(nodelist1[node._v_name]) + else: + repr(node.filters) == repr(nodelist1[node._v_name]) + + +class CopyGroupCase1(CopyGroupTestCase): + close = 0 + filters = None + srcnode = '/group0/group1' + dstnode = '/' + + +class CopyGroupCase2(CopyGroupTestCase): + close = 1 + filters = None + srcnode = '/group0/group1' + dstnode = '/' + + +class CopyGroupCase3(CopyGroupTestCase): + close = 0 + filters = None + srcnode = '/group0' + dstnode = '/group2/group3' + + +class CopyGroupCase4(CopyGroupTestCase): + close = 1 + filters = tb.Filters(complevel=1) + srcnode = '/group0' + dstnode = '/group2/group3' + + +class CopyGroupCase5(CopyGroupTestCase): + close = 0 + filters = tb.Filters() + srcnode = '/' + dstnode = '/group2/group3' + + +class CopyGroupCase6(CopyGroupTestCase): + close = 1 + filters = tb.Filters(fletcher32=True) + srcnode = '/group0' + dstnode = '/group2/group3' + + +class CopyGroupCase7(CopyGroupTestCase): + close = 0 + filters = tb.Filters(complevel=1, shuffle=False) + srcnode = '/' + dstnode = '/' + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class CopyGroupCase8(CopyGroupTestCase): + close = 1 + filters = tb.Filters(complevel=1, complib="lzo") + srcnode = '/' + dstnode = '/' + + +class CopyFileTestCase(common.TempFileMixin, common.PyTablesTestCase): + title = "A title" + nrows = 10 + + def setUp(self): + super().setUp() + + # Create a temporary file + self.h5fname2 = tempfile.mktemp(".h5") + + # Create the source file + self.populateFile() + + def populateFile(self): + group = self.h5file.root + # Add some user attrs: + group._v_attrs.attr1 = "an string for root group" + group._v_attrs.attr2 = 124 + # Create a tree + for group_i in range(5): + for bgroup_i in range(2): + # Create a new group (brother of group) + group2 = self.h5file.create_group(group, + 'bgroup' + str(bgroup_i), + filters=None) + + # Create a table + table = self.h5file.create_table(group2, 'table1', Record2, + title=self.title, + filters=None) + # Get the record object associated with the new table + d = table.row + # Fill the table + for row_i in range(self.nrows): + d['var1'] = '%04d' % (self.nrows - row_i) + d['var2'] = row_i + d['var3'] = row_i * 2 + d.append() # This injects the Record values + # Flush the buffer for this table + table.flush() + + # Add some user attrs: + table.attrs.attr1 = "an string" + table.attrs.attr2 = 234 + + # Create a couple of arrays in each group + var1List = [x['var1'] for x in table.iterrows()] + var3List = [x['var3'] for x in table.iterrows()] + + self.h5file.create_array(group2, 'array1', var1List, "col 1") + self.h5file.create_array(group2, 'array2', var3List, "col 3") + + # Create a couple of EArrays as well + ea1 = self.h5file.create_earray(group2, 'earray1', + tb.StringAtom(itemsize=4), + (0,), "col 1") + ea2 = self.h5file.create_earray(group2, 'earray2', + tb.Int16Atom(), (0,), + "col 3") + # Add some user attrs: + ea1.attrs.attr1 = "an string for earray" + ea2.attrs.attr2 = 123 + # And fill them with some values + ea1.append(var1List) + ea2.append(var3List) + + # Create a new group (descendant of group) + 
group3 = self.h5file.create_group(group, 'group' + str(group_i), + filters=None) + # Iterate over this new group (group3) + group = group3 + # Add some user attrs: + group._v_attrs.attr1 = "an string for group" + group._v_attrs.attr2 = 124 + + def tearDown(self): + # Close the file + if hasattr(self, 'h5file2') and self.h5file2.isopen: + self.h5file2.close() + + if hasattr(self, 'h5fname2') and Path(self.h5fname2).is_file(): + Path(self.h5fname2).unlink() + + super().tearDown() + + def test00_overwrite(self): + """Checking copy of a File (overwriting file)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_overwrite..." % self.__class__.__name__) + + # Create a temporary file + Path(self.h5fname2).write_text('') + + # Copy the file to the destination + self.h5file.copy_file(self.h5fname2, title=self.title, + overwrite=1, + copyuserattrs=0, + filters=None) + + # Close the original file, if needed + if self.close: + self._reopen() + + # ...and open the destination file + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + srcgroup = self.h5file.root + dstgroup = self.h5file2.root + nodelist1 = list(srcgroup._v_children) + nodelist2 = list(dstgroup._v_children) + # Sort the lists + nodelist1.sort() + nodelist2.sort() + if common.verbose: + print("The origin node list -->", nodelist1) + print("The copied node list -->", nodelist2) + self.assertEqual(srcgroup._v_nchildren, dstgroup._v_nchildren) + self.assertEqual(nodelist1, nodelist2) + self.assertEqual(self.h5file2.title, self.title) + + def test00a_srcdstequal(self): + """Checking copy of a File (srcfile == dstfile)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00a_srcdstequal..." % + self.__class__.__name__) + + # Copy the file to the destination + self.assertRaises(IOError, self.h5file.copy_file, self.h5file.filename) + + def test00b_firstclass(self): + """Checking copy of a File (first-class function)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00b_firstclass..." % self.__class__.__name__) + + # Close the temporary file + self.h5file.close() + + # Copy the file to the destination + tb.copy_file(self.h5fname, self.h5fname2, title=self.title, + copyuserattrs=0, filters=None, overwrite=1) + + # ...and open the source and destination file + self.h5file = tb.open_file(self.h5fname, "r") + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + srcgroup = self.h5file.root + dstgroup = self.h5file2.root + nodelist1 = list(srcgroup._v_children) + nodelist2 = list(dstgroup._v_children) + + # Sort the lists + nodelist1.sort() + nodelist2.sort() + if common.verbose: + print("The origin node list -->", nodelist1) + print("The copied node list -->", nodelist2) + self.assertEqual(srcgroup._v_nchildren, dstgroup._v_nchildren) + self.assertEqual(nodelist1, nodelist2) + self.assertEqual(self.h5file2.title, self.title) + + def test01_copy(self): + """Checking copy of a File (attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_copy..." 
% self.__class__.__name__) + + # Copy the file to the destination + self.h5file.copy_file(self.h5fname2, title=self.title, + copyuserattrs=0, + filters=self.filters) + + # Close the original file, if needed + if self.close: + self._reopen() + + # ...and open the destination file + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + srcgroup = self.h5file.root + dstgroup = self.h5file2.root + nodelist1 = list(srcgroup._v_children) + nodelist2 = list(dstgroup._v_children) + + # Sort the lists + nodelist1.sort() + nodelist2.sort() + if common.verbose: + print("The origin node list -->", nodelist1) + print("The copied node list -->", nodelist2) + self.assertEqual(srcgroup._v_nchildren, dstgroup._v_nchildren) + self.assertEqual(nodelist1, nodelist2) + # print("_v_attrnames-->", self.h5file2.root._v_attrs._v_attrnames) + # print("--> <%s,%s>" % (self.h5file2.title, self.title)) + self.assertEqual(self.h5file2.title, self.title) + + # Check that user attributes has not been copied + for srcnode in srcgroup: + dstnode = getattr(dstgroup, srcnode._v_name) + srcattrs = srcnode._v_attrs + srcattrskeys = srcattrs._f_list("sys") + dstattrs = dstnode._v_attrs + dstattrskeys = dstattrs._f_list("all") + + # Filters may differ, do not take into account + if self.filters is not None: + dstattrskeys.remove('FILTERS') + + # These lists should already be ordered + if common.verbose: + print(f"srcattrskeys for node {srcnode._v_name}: " + f"{srcattrskeys}") + print(f"dstattrskeys for node {dstnode._v_name}: " + f"{dstattrskeys}") + self.assertEqual(srcattrskeys, dstattrskeys) + if common.verbose: + print("The attrs names has been copied correctly") + + # Now, for the contents of attributes + for srcattrname in srcattrskeys: + srcattrvalue = str(getattr(srcattrs, srcattrname)) + dstattrvalue = str(getattr(dstattrs, srcattrname)) + self.assertEqual(srcattrvalue, dstattrvalue) + if self.filters is not None: + self.assertEqual(dstattrs.FILTERS, self.filters) + + if common.verbose: + print("The attrs contents has been copied correctly") + + def test02_Attrs(self): + """Checking copy of a File (attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_Attrs..." 
% self.__class__.__name__) + + # Copy the file to the destination + self.h5file.copy_file(self.h5fname2, title=self.title, + copyuserattrs=1, + filters=self.filters) + + # Close the original file, if needed + if self.close: + self._reopen() + + # ...and open the destination file + self.h5file2 = tb.open_file(self.h5fname2, "r") + + # Check that the copy has been done correctly + srcgroup = self.h5file.root + dstgroup = self.h5file2.root + for srcnode in srcgroup: + dstnode = getattr(dstgroup, srcnode._v_name) + srcattrs = srcnode._v_attrs + srcattrskeys = srcattrs._f_list("all") + dstattrs = dstnode._v_attrs + dstattrskeys = dstattrs._f_list("all") + # These lists should already be ordered + if common.verbose: + print(f"srcattrskeys for node {srcnode._v_name}: " + f"{srcattrskeys}") + print(f"dstattrskeys for node {dstnode._v_name}: " + f"{dstattrskeys}") + + # Filters may differ, do not take into account + if self.filters is not None: + dstattrskeys.remove('FILTERS') + self.assertEqual(srcattrskeys, dstattrskeys) + if common.verbose: + print("The attrs names has been copied correctly") + + # Now, for the contents of attributes + for srcattrname in srcattrskeys: + srcattrvalue = str(getattr(srcattrs, srcattrname)) + dstattrvalue = str(getattr(dstattrs, srcattrname)) + self.assertEqual(srcattrvalue, dstattrvalue) + if self.filters is not None: + self.assertEqual(dstattrs.FILTERS, self.filters) + + if common.verbose: + print("The attrs contents has been copied correctly") + + +class CopyFileCase1(CopyFileTestCase): + close = 0 + title = "A new title" + filters = None + + +class CopyFileCase2(CopyFileTestCase): + close = 1 + title = "A new title" + filters = None + + +class CopyFileCase3(CopyFileTestCase): + close = 0 + title = "A new title" + filters = tb.Filters(complevel=1) + + +class CopyFileCase4(CopyFileTestCase): + close = 1 + title = "A new title" + filters = tb.Filters(complevel=1) + + +class CopyFileCase5(CopyFileTestCase): + close = 0 + title = "A new title" + filters = tb.Filters(fletcher32=True) + + +class CopyFileCase6(CopyFileTestCase): + close = 1 + title = "A new title" + filters = tb.Filters(fletcher32=True) + + +class CopyFileCase7(CopyFileTestCase): + close = 0 + title = "A new title" + filters = tb.Filters(complevel=1, complib="lzo") + + +class CopyFileCase8(CopyFileTestCase): + close = 1 + title = "A new title" + filters = tb.Filters(complevel=1, complib="lzo") + + +class CopyFileCase10(common.TempFileMixin, common.PyTablesTestCase): + + def test01_notoverwrite(self): + """Checking copy of a File (checking not overwriting)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_notoverwrite..." 
% + self.__class__.__name__) + + # Create two empty files: + self.h5fname2 = tempfile.mktemp(".h5") + self.h5file2 = tb.open_file(self.h5fname2, "w") + self.h5file2.close() # close the second one + + try: + # Copy the first into the second + self.assertRaises( + IOError, self.h5file.copy_file, self.h5fname2, overwrite=False) + finally: + # Delete files + Path(self.h5fname2).unlink() + + +class GroupFiltersTestCase(common.TempFileMixin, common.PyTablesTestCase): + filters = tb.Filters(complevel=4) # something non-default + + def setUp(self): + super().setUp() + + atom, shape = tb.IntAtom(), (1, 1) + create_group = self.h5file.create_group + create_carray = self.h5file.create_carray + + create_group('/', 'implicit_no') + create_group('/implicit_no', 'implicit_no') + create_carray('/implicit_no/implicit_no', 'implicit_no', + atom=atom, shape=shape) + create_carray('/implicit_no/implicit_no', 'explicit_no', + atom=atom, shape=shape, filters=tb.Filters()) + create_carray('/implicit_no/implicit_no', 'explicit_yes', + atom=atom, shape=shape, filters=self.filters) + + create_group('/', 'explicit_yes', filters=self.filters) + create_group('/explicit_yes', 'implicit_yes') + create_carray('/explicit_yes/implicit_yes', 'implicit_yes', + atom=atom, shape=shape) + create_carray('/explicit_yes/implicit_yes', 'explicit_yes', + atom=atom, shape=shape, filters=self.filters) + create_carray('/explicit_yes/implicit_yes', 'explicit_no', + atom=atom, shape=shape, filters=tb.Filters()) + + def _check_filters(self, h5file, filters=None): + for node in h5file: + # Get node filters. + if hasattr(node, 'filters'): + node_filters = node.filters + else: + node_filters = node._v_filters + + # Compare to given filters. + if filters is not None: + self.assertEqual(node_filters, filters) + return + + # Guess filters to compare to by node name. + if node._v_name.endswith('_no'): + self.assertEqual( + node_filters, tb.Filters(), + "node ``%s`` should have no filters" % node._v_pathname) + elif node._v_name.endswith('_yes'): + self.assertEqual( + node_filters, self.filters, + "node ``%s`` should have filters" % node._v_pathname) + + def test00_propagate(self): + """Filters propagating to children.""" + + self._check_filters(self.h5file) + + def _test_copyFile(self, filters=None): + copyfname = tempfile.mktemp(suffix='.h5') + try: + self.h5file.copy_file(copyfname, filters=filters) + try: + copyf = tb.open_file(copyfname) + self._check_filters(copyf, filters=filters) + finally: + copyf.close() + finally: + Path(copyfname).unlink() + + def test01_copyFile(self): + """Keeping filters when copying a file.""" + + self._test_copyFile() + + def test02_copyFile_override(self): + """Overriding filters when copying a file.""" + + self._test_copyFile(self.filters) + + def _test_change(self, pathname, change_filters, new_filters): + group = self.h5file.get_node(pathname) + + # Check expected current filters. + old_filters = tb.Filters() + if pathname.endswith('_yes'): + old_filters = self.filters + self.assertEqual(group._v_filters, old_filters) + + # Change filters. + change_filters(group) + self.assertEqual(group._v_filters, new_filters) + + # Get and check changed filters. 
+ if self._reopen(): + group = self.h5file.get_node(pathname) + self.assertEqual(group._v_filters, new_filters) + + def test03_change(self): + """Changing the filters of a group.""" + + def set_filters(group): + group._v_filters = self.filters + self._test_change('/', set_filters, self.filters) + + def test04_delete(self): + """Deleting the filters of a group.""" + + def del_filters(group): + del group._v_filters + self._test_change('/explicit_yes', del_filters, tb.Filters()) + + +@common.unittest.skipIf(not common.blosc_avail, 'BLOSC not available') +class SetBloscMaxThreadsTestCase(common.TempFileMixin, + common.PyTablesTestCase): + filters = tb.Filters(complevel=4, complib="blosc") + + def test00(self): + """Checking set_blosc_max_threads()""" + + nthreads_old = tb.set_blosc_max_threads(4) + if common.verbose: + print("Previous max threads:", nthreads_old) + print("Should be:", self.h5file.params['MAX_BLOSC_THREADS']) + self.assertEqual(nthreads_old, self.h5file.params['MAX_BLOSC_THREADS']) + self.h5file.create_carray('/', 'some_array', + atom=tb.Int32Atom(), shape=(3, 3), + filters=self.filters) + nthreads_old = tb.set_blosc_max_threads(1) + if common.verbose: + print("Previous max threads:", nthreads_old) + print("Should be:", 4) + self.assertEqual(nthreads_old, 4) + + def test01(self): + """Checking set_blosc_max_threads() (re-open)""" + + nthreads_old = tb.set_blosc_max_threads(4) + self.h5file.create_carray('/', 'some_array', + atom=tb.Int32Atom(), shape=(3, 3), + filters=self.filters) + self._reopen() + nthreads_old = tb.set_blosc_max_threads(4) + if common.verbose: + print("Previous max threads:", nthreads_old) + print("Should be:", self.h5file.params['MAX_BLOSC_THREADS']) + self.assertEqual(nthreads_old, self.h5file.params['MAX_BLOSC_THREADS']) + + +class FilterTestCase(common.PyTablesTestCase): + def test_filter_pack_type(self): + self.assertEqual(type(tb.Filters()._pack()), np.int64) + + @staticmethod + def _hexl(n): + return hex(int(n)) + + def test_filter_pack_01(self): + filter_ = tb.Filters() + self.assertEqual(self._hexl(filter_._pack()), '0x0') + + def test_filter_pack_02(self): + filter_ = tb.Filters(1, shuffle=False) + self.assertEqual(self._hexl(filter_._pack()), '0x101') + + def test_filter_pack_03(self): + filter_ = tb.Filters(9, 'zlib', shuffle=True, fletcher32=True) + self.assertEqual(self._hexl(filter_._pack()), '0x30109') + + def test_filter_pack_04(self): + filter_ = tb.Filters(1, shuffle=False, least_significant_digit=5) + self.assertEqual(self._hexl(filter_._pack()), '0x5040101') + + def test_filter_unpack_01(self): + filter_ = tb.Filters._unpack(np.int64(0x0)) + self.assertFalse(filter_.shuffle) + self.assertFalse(filter_.fletcher32) + self.assertEqual(filter_.least_significant_digit, None) + self.assertEqual(filter_.complevel, 0) + self.assertEqual(filter_.complib, None) + + def test_filter_unpack_02(self): + filter_ = tb.Filters._unpack(np.int64(0x101)) + self.assertFalse(filter_.shuffle) + self.assertFalse(filter_.fletcher32) + self.assertEqual(filter_.least_significant_digit, None) + self.assertEqual(filter_.complevel, 1) + self.assertEqual(filter_.complib, 'zlib') + + def test_filter_unpack_03(self): + filter_ = tb.Filters._unpack(np.int64(0x30109)) + self.assertTrue(filter_.shuffle) + self.assertTrue(filter_.fletcher32) + self.assertEqual(filter_.least_significant_digit, None) + self.assertEqual(filter_.complevel, 9) + self.assertEqual(filter_.complib, 'zlib') + + def test_filter_unpack_04(self): + filter_ = tb.Filters._unpack(np.int64(0x5040101)) + 
self.assertFalse(filter_.shuffle) + self.assertFalse(filter_.fletcher32) + self.assertEqual(filter_.least_significant_digit, 5) + self.assertEqual(filter_.complevel, 1) + self.assertEqual(filter_.complib, 'zlib') + + +class DefaultDriverTestCase(common.TempFileMixin, common.PyTablesTestCase): + DRIVER = None + DRIVER_PARAMS = {} + open_kwargs = dict(driver=DRIVER, **DRIVER_PARAMS) + + def setUp(self): + super().setUp() + + # Create an HDF5 file and contents + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr", 41) + self.h5file.create_array(root, "array", [1, 2], title="array") + self.h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + + def assertIsFile(self): + self.assertTrue(Path(self.h5fname).is_file()) + + def test_newFile(self): + self.assertIsInstance(self.h5file, tb.File) + self.assertIsFile() + + def test_readFile(self): + self.h5file.close() + self.h5file = None + + self.assertIsFile() + + # Open an existing HDF5 file + self.h5file = tb.open_file(self.h5fname, mode="r", + driver=self.DRIVER, + **self.DRIVER_PARAMS) + + # check contents + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + def test_openFileA(self): + self.h5file.close() + self.h5file = None + + self.assertIsFile() + + # Open an existing HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="a", + driver=self.DRIVER, + **self.DRIVER_PARAMS) + + # check contents + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + # write new data + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr2", 42) + self.h5file.create_array(root, "array2", [1, 2], title="array2") + self.h5file.create_table(root, "table2", {"var2": tb.FloatCol()}, + title="table2") + + # check contents + self._reopen(mode="a", driver=self.DRIVER, **self.DRIVER_PARAMS) + + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + self.assertEqual(self.h5file.get_node_attr(root, "testattr2"), 42) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.array2, tb.Array) + self.assertEqual(root.array2._v_title, "array2") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + self.assertIsInstance(root.table2, tb.Table) + self.assertEqual(root.table2._v_title, "table2") + self.assertIn("var2", root.table2.colnames) + self.assertEqual(root.table2.cols.var2.dtype, tb.FloatCol().dtype) + + def test_openFileRW(self): + self.h5file.close() + self.h5file = None + + self.assertIsFile() + + # Open an existing HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="r+", + driver=self.DRIVER, + **self.DRIVER_PARAMS) + + # 
check contents + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + # write new data + self.h5file.set_node_attr(root, "testattr2", 42) + self.h5file.create_array(root, "array2", [1, 2], title="array2") + self.h5file.create_table(root, "table2", {"var2": tb.FloatCol()}, + title="table2") + + # check contents + self._reopen(mode="r+", driver=self.DRIVER, **self.DRIVER_PARAMS) + + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + self.assertEqual(self.h5file.get_node_attr(root, "testattr2"), 42) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.array2, tb.Array) + self.assertEqual(root.array2._v_title, "array2") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + self.assertIsInstance(root.table2, tb.Table) + self.assertEqual(root.table2._v_title, "table2") + self.assertIn("var2", root.table2.colnames) + self.assertEqual(root.table2.cols.var2.dtype, tb.FloatCol().dtype) + + +@common.unittest.skipIf(common.hdf5_version < Version("1.8.9"), + "requires HDF5 >= 1.8,9") +class Sec2DriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_SEC2" + open_kwargs = dict(driver=DRIVER, **DefaultDriverTestCase.DRIVER_PARAMS) + + def test_get_file_image(self): + image = self.h5file.get_file_image() + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + + +@common.unittest.skipIf(common.hdf5_version < Version("1.8.9"), + "requires HDF5 >= 1.8,9") +class StdioDriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_STDIO" + open_kwargs = dict(driver=DRIVER, **DefaultDriverTestCase.DRIVER_PARAMS) + + def test_get_file_image(self): + image = self.h5file.get_file_image() + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + + +@common.unittest.skipIf(common.hdf5_version < Version("1.8.9"), + "requires HDF5 >= 1.8,9") +class CoreDriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_CORE" + open_kwargs = dict(driver=DRIVER, **DefaultDriverTestCase.DRIVER_PARAMS) + + def test_get_file_image(self): + image = self.h5file.get_file_image() + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + + +class CoreDriverNoBackingStoreTestCase(common.PyTablesTestCase): + DRIVER = "H5FD_CORE" + + def setUp(self): + super().setUp() + + self.h5fname = tempfile.mktemp(suffix=".h5") + self.h5file = None + + def tearDown(self): + if self.h5file: + self.h5file.close() + elif self.h5fname in tb.file._open_files: + open_files = tb.file._open_files + for h5file in open_files.get_handlers_by_name(self.h5fname): + h5file.close() + + self.h5file = None + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + + super().tearDown() + + def test_newFile(self): + """Ensure that nothing is written to file.""" + + self.assertFalse(Path(self.h5fname).is_file()) + + self.h5file = tb.open_file(self.h5fname, mode="w", + driver=self.DRIVER, + driver_core_backing_store=False) + + # 
Create an HDF5 file and contents + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr", 41) + self.h5file.create_array(root, "array", [1, 2], title="array") + self.h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + self.h5file.close() # flush + + self.assertFalse(Path(self.h5fname).is_file()) + + def test_readNewFileW(self): + self.assertFalse(Path(self.h5fname).is_file()) + + # Create an HDF5 file and contents + self.h5file = tb.open_file(self.h5fname, mode="w", + driver=self.DRIVER, + driver_core_backing_store=False) + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr", 41) + self.h5file.create_array(root, "array", [1, 2], title="array") + self.h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + self.h5file.close() # flush + + self.assertFalse(Path(self.h5fname).is_file()) + + def test_readNewFileA(self): + self.assertFalse(Path(self.h5fname).is_file()) + + # Create an HDF5 file and contents + self.h5file = tb.open_file(self.h5fname, mode="a", + driver=self.DRIVER, + driver_core_backing_store=False) + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr", 41) + self.h5file.create_array(root, "array", [1, 2], title="array") + self.h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + self.h5file.close() # flush + + self.assertFalse(Path(self.h5fname).is_file()) + + def test_openNewFileRW(self): + self.assertFalse(Path(self.h5fname).is_file()) + self.assertRaises(tb.HDF5ExtError, + tb.open_file, self.h5fname, mode="r+", + driver=self.DRIVER, driver_core_backing_store=False) + + def test_openNewFileR(self): + self.assertFalse(Path(self.h5fname).is_file()) + self.assertRaises(tb.HDF5ExtError, + tb.open_file, self.h5fname, mode="r", + driver=self.DRIVER, driver_core_backing_store=False) + + def _create_file(self, filename): + h5file = tb.open_file(filename, mode="w") + + root = h5file.root + h5file.set_node_attr(root, "testattr", 41) + h5file.create_array(root, "array", [1, 2], title="array") + h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + + h5file.close() + + def test_readFile(self): + self._create_file(self.h5fname) + self.assertTrue(Path(self.h5fname).is_file()) + + # Open an existing HDF5 file + self.h5file = tb.open_file(self.h5fname, mode="r", + driver=self.DRIVER, + driver_core_backing_store=False) + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + 
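An editorial sketch (not part of the imported sources) of the behaviour these CORE-driver tests rely on: with driver_core_backing_store disabled nothing is ever written to disk, yet the in-memory HDF5 image stays accessible through get_file_image() (HDF5 >= 1.8.9). The file name below is purely illustrative and is never created.

import tables as tb

with tb.open_file("never-created.h5", "w", driver="H5FD_CORE",
                  driver_core_backing_store=False) as f:
    f.create_array("/", "a", [1, 2, 3], title="in-memory only")
    image = f.get_file_image()       # raw bytes of the HDF5 file image
    assert image[:4] == b"\x89HDF"   # HDF5 signature: 137, 72, 68, 70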
+ def _get_digest(self, filename): + md5 = hashlib.md5() + md5.update(Path(filename).read_bytes()) + hexdigest = md5.hexdigest() + return hexdigest + + def test_openFileA(self): + self._create_file(self.h5fname) + self.assertTrue(Path(self.h5fname).is_file()) + + # compute the file hash + hexdigest = self._get_digest(self.h5fname) + + # Open an existing HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="a", + driver=self.DRIVER, + driver_core_backing_store=False) + + # check contents + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + # write new data + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr2", 42) + self.h5file.create_array(root, "array2", [1, 2], title="array2") + self.h5file.create_table(root, "table2", {"var2": tb.FloatCol()}, + title="table2") + self.h5file.close() + + # ensure that there is no change on the file on disk + self.assertEqual(hexdigest, self._get_digest(self.h5fname)) + + def test_openFileRW(self): + self._create_file(self.h5fname) + self.assertTrue(Path(self.h5fname).is_file()) + + # compute the file hash + hexdigest = self._get_digest(self.h5fname) + + # Open an existing HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="r+", + driver=self.DRIVER, + driver_core_backing_store=False) + + # check contents + root = self.h5file.root + + self.assertEqual(self.h5file.get_node_attr(root, "testattr"), 41) + + self.assertIsInstance(root.array, tb.Array) + self.assertEqual(root.array._v_title, "array") + + self.assertIsInstance(root.table, tb.Table) + self.assertEqual(root.table._v_title, "table") + self.assertIn("var1", root.table.colnames) + self.assertEqual(root.table.cols.var1.dtype, tb.IntCol().dtype) + + # write new data + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr2", 42) + self.h5file.create_array(root, "array2", [1, 2], title="array2") + self.h5file.create_table(root, "table2", {"var2": tb.FloatCol()}, + title="table2") + self.h5file.close() + + # ensure that there is no change on the file on disk + self.assertEqual(hexdigest, self._get_digest(self.h5fname)) + + @common.unittest.skipIf(common.hdf5_version < Version("1.8.9"), + 'HDF5 >= "1.8.9" required') + def test_get_file_image(self): + self.h5file = tb.open_file(self.h5fname, mode="w", + driver=self.DRIVER, + driver_core_backing_store=False) + root = self.h5file.root + self.h5file.set_node_attr(root, "testattr", 41) + self.h5file.create_array(root, "array", [1, 2], title="array") + self.h5file.create_table(root, "table", {"var1": tb.IntCol()}, + title="table") + + image = self.h5file.get_file_image() + + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + + +class SplitDriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_SPLIT" + DRIVER_PARAMS = { + "driver_split_meta_ext": "-xm.h5", + "driver_split_raw_ext": "-xr.h5", + } + open_kwargs = dict(driver=DRIVER, **DRIVER_PARAMS) + + def _getTempFileName(self): + return tempfile.mktemp(prefix=self._getName()) + + def setUp(self): + super().setUp() + + self.h5fnames = [self.h5fname + self.DRIVER_PARAMS[k] for k in + ("driver_split_meta_ext", "driver_split_raw_ext")] + + def 
tearDown(self): + self.h5file.close() + for fname in self.h5fnames: + if Path(fname).is_file(): + Path(fname).unlink() + # super().tearDown() + common.PyTablesTestCase.tearDown(self) + + def assertIsFile(self): + for fname in self.h5fnames: + self.assertTrue(Path(fname).is_file()) + + +class NotSpportedDriverTestCase(common.PyTablesTestCase): + DRIVER = None + DRIVER_PARAMS = {} + EXCEPTION = ValueError + + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(suffix=".h5") + + def tearDown(self): + open_files = tb.file._open_files + if self.h5fname in open_files: + for h5file in open_files.get_handlers_by_name(self.h5fname): + h5file.close() + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def test_newFile(self): + self.assertRaises(self.EXCEPTION, tb.open_file, self.h5fname, + mode="w", driver=self.DRIVER, **self.DRIVER_PARAMS) + self.assertFalse(Path(self.h5fname).is_file()) + + +if "H5FD_LOG" in tb.hdf5extension._supported_drivers: + BaseLogDriverTestCase = DefaultDriverTestCase + +else: + BaseLogDriverTestCase = NotSpportedDriverTestCase + + +class LogDriverTestCase(BaseLogDriverTestCase): + DRIVER = "H5FD_LOG" + open_kwargs = dict(driver=DRIVER, **BaseLogDriverTestCase.DRIVER_PARAMS) + + def setUp(self): + # local binding + self.DRIVER_PARAMS = { + "driver_log_file": tempfile.mktemp(suffix=".log") + } + + super().setUp() + + def tearDown(self): + if Path(self.DRIVER_PARAMS["driver_log_file"]).is_file(): + Path(self.DRIVER_PARAMS["driver_log_file"]).unlink() + super().tearDown() + + +if tb.hdf5extension.HAVE_DIRECT_DRIVER: + class DirectDriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_DIRECT" + open_kwargs = dict( + driver=DRIVER, **DefaultDriverTestCase.DRIVER_PARAMS + ) + +else: + class DirectDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_DIRECT" + EXCEPTION = RuntimeError + + +if tb.hdf5extension.HAVE_WINDOWS_DRIVER: + class WindowsDriverTestCase(DefaultDriverTestCase): + DRIVER = "H5FD_WINDOWS" + open_kwargs = dict( + driver=DRIVER, **DefaultDriverTestCase.DRIVER_PARAMS + ) + +else: + class WindowsDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_WINDOWS" + EXCEPTION = RuntimeError + + +class FamilyDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_FAMILY" + + +class MultiDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_MULTI" + + +class MpioDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_MPIO" + + +class MpiPosixDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_MPIPOSIX" + + +class StreamDriverTestCase(NotSpportedDriverTestCase): + DRIVER = "H5FD_STREAM" + + +@common.unittest.skipIf(common.hdf5_version < Version("1.8.9"), + 'HDF5 >= "1.8.9" required') +class InMemoryCoreDriverTestCase(common.PyTablesTestCase): + DRIVER = "H5FD_CORE" + + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(".h5") + self.h5file = None + + def tearDown(self): + if self.h5file: + self.h5file.close() + self.h5file = None + + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def _create_image(self, filename="in-memory", title="Title", mode='w'): + h5file = tb.open_file(filename, mode=mode, title=title, + driver=self.DRIVER, + driver_core_backing_store=0) + + try: + h5file.create_array(h5file.root, 'array', [1, 2], title="Array") + h5file.create_table(h5file.root, 'table', { + 'var1': tb.IntCol()}, "Table") + h5file.root._v_attrs.testattr = 41 + + image = h5file.get_file_image() + finally: + h5file.close() + + return image + + 
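+    # Editorial note, not part of the original test module: the tests below
+    # exercise the HDF5 file-image facility.  ``File.get_file_image()``
+    # returns the complete in-memory file as ``bytes`` (an HDF5 image always
+    # starts with b"\x89HDF", i.e. [137, 72, 68, 70]), and such an image can
+    # later be re-opened through the CORE driver via ``driver_core_image``.
+    # A minimal round-trip sketch (assuming HDF5 >= 1.8.9) would be:
+    #
+    #     import tables as tb
+    #     h5file = tb.open_file("image.h5", mode="w", driver="H5FD_CORE",
+    #                           driver_core_backing_store=0)
+    #     h5file.create_array(h5file.root, "array", [1, 2], title="Array")
+    #     image = h5file.get_file_image()
+    #     h5file.close()
+    #
+    #     h5file = tb.open_file("image.h5", mode="r", driver="H5FD_CORE",
+    #                           driver_core_image=image,
+    #                           driver_core_backing_store=0)
+    #     print(h5file.root.array.read())   # [1, 2]
+    #     h5file.close()
+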
def test_newFileW(self): + image = self._create_image(self.h5fname, mode='w') + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + self.assertFalse(Path(self.h5fname).exists()) + + def test_newFileA(self): + image = self._create_image(self.h5fname, mode='a') + self.assertGreater(len(image), 0) + self.assertEqual([i for i in image[:4]], [137, 72, 68, 70]) + self.assertFalse(Path(self.h5fname).exists()) + + def test_openFileR(self): + image = self._create_image(self.h5fname) + self.assertFalse(Path(self.h5fname).exists()) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="r", + driver=self.DRIVER, + driver_core_image=image, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + def test_openFileRW(self): + image = self._create_image(self.h5fname) + self.assertFalse(Path(self.h5fname).exists()) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="r+", + driver=self.DRIVER, + driver_core_image=image, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + self.h5file.create_array(self.h5file.root, 'array2', + list(range(10_000)), + title="Array2") + self.h5file.root._v_attrs.testattr2 = 42 + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + def test_openFileRW_update(self): + filename = tempfile.mktemp(".h5") + image1 = self._create_image(filename) + self.assertFalse(Path(self.h5fname).exists()) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="r+", + driver=self.DRIVER, + driver_core_image=image1, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + data = list(range(2 * tb.parameters.DRIVER_CORE_INCREMENT)) + self.h5file.create_array(self.h5file.root, 
'array2', data, + title="Array2") + self.h5file.root._v_attrs.testattr2 = 42 + + image2 = self.h5file.get_file_image() + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + self.assertNotEqual(len(image1), len(image2)) + self.assertNotEqual(image1, image2) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="r", + driver=self.DRIVER, + driver_core_image=image2, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr2")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr2"), 42) + self.assertTrue(hasattr(self.h5file.root, "array2")) + self.assertEqual(self.h5file.get_node_attr( + "/array2", "TITLE"), "Array2") + self.assertEqual(self.h5file.root.array2.read(), data) + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + def test_openFileA(self): + image = self._create_image(self.h5fname) + self.assertFalse(Path(self.h5fname).exists()) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="a", + driver=self.DRIVER, + driver_core_image=image, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + def test_openFileA_update(self): + h5fname = tempfile.mktemp(".h5") + image1 = self._create_image(h5fname) + self.assertFalse(Path(self.h5fname).exists()) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="a", + driver=self.DRIVER, + driver_core_image=image1, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + data = list(range(2 * tb.parameters.DRIVER_CORE_INCREMENT)) + self.h5file.create_array(self.h5file.root, 'array2', data, + title="Array2") + 
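+        # Editorial note, not part of the original test module: ``data``
+        # spans two CORE-driver allocation increments
+        # (``tb.parameters.DRIVER_CORE_INCREMENT`` items), presumably so
+        # that appending it forces the in-memory image to grow; the test
+        # below then expects ``image2`` to differ from ``image1`` in both
+        # length and content.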
self.h5file.root._v_attrs.testattr2 = 42 + + image2 = self.h5file.get_file_image() + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + self.assertNotEqual(len(image1), len(image2)) + self.assertNotEqual(image1, image2) + + # Open an existing file + self.h5file = tb.open_file(self.h5fname, mode="r", + driver=self.DRIVER, + driver_core_image=image2, + driver_core_backing_store=0) + + # Get the CLASS attribute of the arr object + self.assertTrue(hasattr(self.h5file.root._v_attrs, "TITLE")) + self.assertEqual(self.h5file.get_node_attr("/", "TITLE"), "Title") + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr"), 41) + self.assertTrue(hasattr(self.h5file.root, "array")) + self.assertEqual(self.h5file.get_node_attr("/array", "TITLE"), "Array") + self.assertTrue(hasattr(self.h5file.root, "table")) + self.assertEqual(self.h5file.get_node_attr("/table", "TITLE"), "Table") + self.assertEqual(self.h5file.root.array.read(), [1, 2]) + + self.assertTrue(hasattr(self.h5file.root._v_attrs, "testattr2")) + self.assertEqual(self.h5file.get_node_attr("/", "testattr2"), 42) + self.assertTrue(hasattr(self.h5file.root, "array2")) + self.assertEqual(self.h5file.get_node_attr( + "/array2", "TITLE"), "Array2") + self.assertEqual(self.h5file.root.array2.read(), data) + + self.h5file.close() + + self.assertFalse(Path(self.h5fname).exists()) + + def test_str(self): + self.h5file = tb.open_file(self.h5fname, mode="w", title="Title", + driver=self.DRIVER, + driver_core_backing_store=0) + + self.h5file.create_array(self.h5file.root, 'array', [1, 2], + title="Array") + self.h5file.create_table(self.h5file.root, 'table', + {'var1': tb.IntCol()}, "Table") + self.h5file.root._v_attrs.testattr = 41 + + # ensure that the __str__ method works even if there is no phisical + # file on disk (in which case the os.stat operation for date retrieval + # fails) + self.assertIsNotNone(str(self.h5file)) + + self.h5file.close() + self.assertFalse(Path(self.h5fname).exists()) + + +class QuantizeTestCase(common.TempFileMixin, common.PyTablesTestCase): + mode = "w" + title = "This is the table title" + expectedrows = 10 + appendrows = 5 + + def setUp(self): + super().setUp() + + self.data = np.linspace(-5., 5., 41) + self.randomdata = np.random.random_sample(1_000_000) + self.randomints = np.random.randint( + -1_000_000, 1_000_000, 1_000_000).astype('int64') + + self.populateFile() + self.h5file.close() + + self.quantizeddata_0 = np.asarray( + [-5.] * 2 + [-4.] * 5 + [-3.] * 3 + [-2.] * 5 + [-1.] * 3 + + [0.] * 5 + [1.] * 3 + [2.] * 5 + [3.] * 3 + [4.] * 5 + [5.] * 2) + self.quantizeddata_m1 = np.asarray( + [-8.] * 4 + [0.] * 33 + [8.] 
* 4) + + def populateFile(self): + root = self.h5file.root + filters = tb.Filters(complevel=1, complib="blosc", + least_significant_digit=1) + ints = self.h5file.create_carray(root, "integers", tb.Int64Atom(), + (1_000_000,), filters=filters) + ints[:] = self.randomints + floats = self.h5file.create_carray(root, "floats", tb.Float32Atom(), + (1_000_000,), filters=filters) + floats[:] = self.randomdata + data1 = self.h5file.create_carray(root, "data1", tb.Float64Atom(), + (41,), filters=filters) + data1[:] = self.data + filters = tb.Filters(complevel=1, complib="blosc", + least_significant_digit=0) + data0 = self.h5file.create_carray(root, "data0", tb.Float64Atom(), + (41,), filters=filters) + data0[:] = self.data + filters = tb.Filters(complevel=1, complib="blosc", + least_significant_digit=2) + data2 = self.h5file.create_carray(root, "data2", tb.Float64Atom(), + (41,), filters=filters) + data2[:] = self.data + filters = tb.Filters(complevel=1, complib="blosc", + least_significant_digit=-1) + datam1 = self.h5file.create_carray(root, "datam1", tb.Float64Atom(), + (41,), filters=filters) + datam1[:] = self.data + + def test00_quantizeData(self): + """Checking the quantize() function.""" + + quantized_0 = tb.utils.quantize(self.data, 0) + quantized_1 = tb.utils.quantize(self.data, 1) + quantized_2 = tb.utils.quantize(self.data, 2) + quantized_m1 = tb.utils.quantize(self.data, -1) + np.testing.assert_array_equal(quantized_0, self.quantizeddata_0) + np.testing.assert_array_equal(quantized_1, self.data) + np.testing.assert_array_equal(quantized_2, self.data) + np.testing.assert_array_equal(quantized_m1, self.quantizeddata_m1) + + def test01_quantizeDataMaxError(self): + """Checking the maximum error introduced by the quantize() function.""" + + quantized_0 = tb.utils.quantize(self.randomdata, 0) + quantized_1 = tb.utils.quantize(self.randomdata, 1) + quantized_2 = tb.utils.quantize(self.randomdata, 2) + quantized_m1 = tb.utils.quantize(self.randomdata, -1) + + self.assertLess(np.abs(quantized_0 - self.randomdata).max(), 0.5) + self.assertLess(np.abs(quantized_1 - self.randomdata).max(), 0.05) + self.assertLess(np.abs(quantized_2 - self.randomdata).max(), 0.005) + self.assertLess(np.abs(quantized_m1 - self.randomdata).max(), 1.) + + def test02_array(self): + """Checking quantized data as written to disk.""" + + self.h5file = tb.open_file(self.h5fname, "r") + np.testing.assert_array_equal(self.h5file.root.data1[:], self.data) + np.testing.assert_array_equal(self.h5file.root.data2[:], self.data) + np.testing.assert_array_equal(self.h5file.root.data0[:], + self.quantizeddata_0) + np.testing.assert_array_equal(self.h5file.root.datam1[:], + self.quantizeddata_m1) + np.testing.assert_array_equal(self.h5file.root.integers[:], + self.randomints) + self.assertEqual(self.h5file.root.integers[:].dtype, + self.randomints.dtype) + + self.assertLess( + np.abs(self.h5file.root.floats[:] - self.randomdata).max(), 0.05) + + +def suite(): + import doctest + + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # Uncomment this only for testing purposes! 
+ + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(FiltersCase1)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase2)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase10)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscBloscLZ)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscLZ4)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscLZ4HC)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscSnappy)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscZlib)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscZstd)) + theSuite.addTest(common.unittest.makeSuite(FiltersCaseBloscBitShuffle)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase1)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase2)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase1)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase2)) + theSuite.addTest(common.unittest.makeSuite(GroupFiltersTestCase)) + theSuite.addTest(common.unittest.makeSuite(SetBloscMaxThreadsTestCase)) + theSuite.addTest(common.unittest.makeSuite(FilterTestCase)) + theSuite.addTest(doctest.DocTestSuite(tb.filters)) + + theSuite.addTest(common.unittest.makeSuite(DefaultDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(Sec2DriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(StdioDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(CoreDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CoreDriverNoBackingStoreTestCase)) + theSuite.addTest(common.unittest.makeSuite(SplitDriverTestCase)) + + theSuite.addTest(common.unittest.makeSuite(LogDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(DirectDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(WindowsDriverTestCase)) + + theSuite.addTest(common.unittest.makeSuite(FamilyDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(MultiDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(MpioDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(MpiPosixDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(StreamDriverTestCase)) + theSuite.addTest(common.unittest.makeSuite(InMemoryCoreDriverTestCase)) + + theSuite.addTest(common.unittest.makeSuite(QuantizeTestCase)) + + if common.heavy: + theSuite.addTest(common.unittest.makeSuite(CreateTestCase)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase3)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase4)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase5)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase6)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase7)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase8)) + theSuite.addTest(common.unittest.makeSuite(FiltersCase9)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase3)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase4)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase5)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase6)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase7)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase8)) + theSuite.addTest(common.unittest.makeSuite(CopyFileCase10)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase3)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase4)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase5)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase6)) + theSuite.addTest(common.unittest.makeSuite(CopyGroupCase7)) + 
theSuite.addTest(common.unittest.makeSuite(CopyGroupCase8)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_do_undo.py b/tables/tests/test_do_undo.py new file mode 100644 index 0000000..bbedb7d --- /dev/null +++ b/tables/tests/test_do_undo.py @@ -0,0 +1,2730 @@ +import warnings + +import tables as tb +from tables.tests import common + + +class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test for basic Undo/Redo operations.""" + + _reopen_flag = False + """Whether to reopen the file at certain points.""" + + def _do_reopen(self): + if self._reopen_flag: + self._reopen('r+') + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + def test00_simple(self): + """Checking simple do/undo.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_simple..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertNotIn("/otherarray", self.h5file) + self.assertEqual(self.h5file._curaction, 0) + self.assertEqual(self.h5file._curmark, 0) + + # Redo the operation + self._do_reopen() + self.h5file.redo() + if common.verbose: + print("Object tree after redo:", self.h5file) + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray", self.h5file) + self.assertEqual(self.h5file.root.otherarray.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray.title, "Another array") + self.assertEqual(self.h5file._curaction, 1) + self.assertEqual(self.h5file._curmark, 0) + + def test01_twice(self): + """Checking do/undo (twice operations intertwined)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_twice..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operations + self._do_reopen() + self.h5file.undo() + self.assertNotIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file._curaction, 0) + self.assertEqual(self.h5file._curmark, 0) + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray.title, "Another array") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file._curaction, 2) + self.assertEqual(self.h5file._curmark, 0) + + def test02_twice2(self): + """Checking twice ops and two marks.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_twice2..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + + # Put a mark + self._do_reopen() + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + self.assertEqual(self.h5file._curaction, 3) + self.assertEqual(self.h5file._curmark, 1) + + # Unwind just one mark + self.h5file.undo() + self.assertIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file._curaction, 2) + self.assertEqual(self.h5file._curmark, 1) + + # Unwind another mark + self.h5file.undo() + self.assertEqual(self.h5file._curaction, 0) + self.assertEqual(self.h5file._curmark, 0) + self.assertNotIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + + # Redo until the next mark + self.h5file.redo() + self.assertIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self._do_reopen() + self.assertEqual(self.h5file._curaction, 2) + self.assertEqual(self.h5file._curmark, 1) + + # Redo until the end + self.h5file.redo() + self.assertIn("/otherarray", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray.title, "Another array") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file._curaction, 3) + self.assertEqual(self.h5file._curmark, 1) + + def test03_6times3marks(self): + """Checking with six ops and three marks.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_6times3marks..." 
% + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Put a mark + self.h5file.mark() + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Put a mark + self._do_reopen() + self.h5file.mark() + self.h5file.create_array('/', 'otherarray5', [7, 8], "Another array 5") + self.h5file.create_array('/', 'otherarray6', [8, 9], "Another array 6") + + # Unwind just one mark + self.h5file.undo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Unwind another mark + self.h5file.undo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Unwind all marks + self.h5file.undo() + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Redo until the next mark + self._do_reopen() + self.h5file.redo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Redo until the next mark + self.h5file.redo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Redo until the end + self.h5file.redo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + self.assertIn("/otherarray5", self.h5file) + self.assertIn("/otherarray6", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray4.read(), [6, 7]) + self.assertEqual(self.h5file.root.otherarray5.read(), [7, 8]) + self.assertEqual(self.h5file.root.otherarray6.read(), [8, 9]) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file.root.otherarray3.title, "Another array 3") + self.assertEqual(self.h5file.root.otherarray4.title, "Another array 4") + self.assertEqual(self.h5file.root.otherarray5.title, "Another array 5") + self.assertEqual(self.h5file.root.otherarray6.title, "Another array 6") + + def test04_6times3marksro(self): + """Checking with six operations, three marks and do/undo in 
random + order.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_6times3marksro..." % + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Put a mark + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Unwind the previous mark + self.h5file.undo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Put a mark in the middle of stack + if common.verbose: + print("All nodes:", self.h5file.walk_nodes()) + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray5', [7, 8], "Another array 5") + self.h5file.create_array('/', 'otherarray6', [8, 9], "Another array 6") + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertIn("/otherarray5", self.h5file) + self.assertIn("/otherarray6", self.h5file) + + # Unwind previous mark + self.h5file.undo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertNotIn("/otherarray5", self.h5file) + self.assertNotIn("/otherarray6", self.h5file) + + # Redo until the last mark + self.h5file.redo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertIn("/otherarray5", self.h5file) + self.assertIn("/otherarray6", self.h5file) + + # Redo until the next mark (non-existent, so no action) + self._do_reopen() + self.h5file.redo() + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + self.assertIn("/otherarray5", self.h5file) + self.assertIn("/otherarray6", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray5.read(), [7, 8]) + self.assertEqual(self.h5file.root.otherarray6.read(), [8, 9]) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file.root.otherarray5.title, "Another array 5") + self.assertEqual(self.h5file.root.otherarray6.title, "Another array 6") + + def test05_destructive(self): + """Checking with a destructive action during undo.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_destructive..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + + # Put a mark + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + + # Do the destructive operation + self._do_reopen() + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + + # Check objects + self.assertIn("/otherarray1", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertNotIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray3.title, "Another array 3") + + def test05b_destructive(self): + """Checking with a destructive action during undo (II)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_destructive..." % + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + + # Put a mark + self._do_reopen() + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + + # Do the destructive operation + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + + # Put a mark + self._do_reopen() + self.h5file.mark() + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + self.assertIn("/otherarray4", self.h5file) + + # Now undo the past operation + self.h5file.undo() + + # Check objects + self.assertIn("/otherarray1", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertNotIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray3.title, "Another array 3") + self.assertNotIn("/otherarray4", self.h5file) + + def test05c_destructive(self): + """Checking with a destructive action during undo (III)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05c_destructive..." 
% + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + + # Put a mark + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + + # Do the destructive operation + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + + # Put a mark + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + self.assertIn("/otherarray4", self.h5file) + + # Now unwind twice + self.h5file.undo() + self._do_reopen() + self.h5file.undo() + + # Check objects + self.assertIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + def test05d_destructive(self): + """Checking with a destructive action during undo (IV)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05d_destructive..." % + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + + # Put a mark + self._do_reopen() + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + + # Do the destructive operation + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + + # Put a mark + self.h5file.mark() + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + self.assertIn("/otherarray4", self.h5file) + + # Now, go to the first mark + self._do_reopen() + self.h5file.undo(0) + + # Check objects + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + def test05e_destructive(self): + """Checking with a destructive action during undo (V)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05e_destructive..." % + self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + + # Put a mark + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + self._do_reopen() + + # Do the destructive operation + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + + # Now, unwind the actions + self.h5file.undo(0) + self._do_reopen() + + # Check objects + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + + def test05f_destructive(self): + """Checking with a destructive creation of existing node during undo""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05f_destructive..." 
% + self.__class__.__name__) + + self.h5file.enable_undo() + self.h5file.create_array('/', 'newarray', [1]) + self.h5file.undo() + self._do_reopen() + self.assertNotIn('/newarray', self.h5file) + newarr = self.h5file.create_array('/', 'newarray', [1]) + self.h5file.undo() + self.assertNotIn('/newarray', self.h5file) + self._do_reopen() + self.h5file.redo() + self.assertIn('/newarray', self.h5file) + if not self._reopen_flag: + self.assertIs(self.h5file.root.newarray, newarr) + + def test06_totalunwind(self): + """Checking do/undo (total unwind)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06_totalunwind..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operations + self._do_reopen() + self.h5file.undo(0) + self.assertNotIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + + def test07_totalrewind(self): + """Checking do/undo (total rewind)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_totalunwind..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + self.h5file.mark() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operations + self.h5file.undo(0) + + # Redo all the operations + self._do_reopen() + self.h5file.redo(-1) + + # Check that objects has come back to life in a sane state + self.assertIn("/otherarray", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray.title, "Another array") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + + def test08_marknames(self): + """Checking mark names.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_marknames..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self.h5file.mark("first") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + self.h5file.mark("second") + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self.h5file.mark("third") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Now go to mark "first" + self.h5file.undo("first") + self._do_reopen() + self.assertIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to mark "third" + self.h5file.redo("third") + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Now go to mark "second" + self.h5file.undo("second") + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to the end + self._do_reopen() + self.h5file.redo(-1) + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + + # Check that objects has come back to life in a sane state + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray4.read(), [6, 7]) + + def test08_initialmark(self): + """Checking initial mark.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_initialmark..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + initmid = self.h5file.get_current_mark() + + # Create a new array + self.h5file.create_array('/', 'otherarray', [3, 4], "Another array") + self.h5file.mark() + self._do_reopen() + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + + # Now undo the past operations + self.h5file.undo(initmid) + self.assertNotIn("/otherarray", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + + # Redo all the operations + self.h5file.redo(-1) + self._do_reopen() + + # Check that objects has come back to life in a sane state + self.assertIn("/otherarray", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray.title, "Another array") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + + def test09_marknames(self): + """Checking mark names (wrong direction)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09_marknames..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self.h5file.mark("first") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + self.h5file.mark("second") + self._do_reopen() + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self.h5file.mark("third") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Now go to mark "first" + self.h5file.undo("first") + + # Try to undo up to mark "third" + with self.assertRaises(tb.UndoRedoError): + self.h5file.undo("third") + + # Now go to mark "third" + self.h5file.redo("third") + self._do_reopen() + + # Try to redo up to mark "second" + with self.assertRaises(tb.UndoRedoError): + self.h5file.redo("second") + + # Final checks + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + def test10_goto(self): + """Checking mark names (goto)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10_goto..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self._do_reopen() + self.h5file.mark("first") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + self.h5file.mark("second") + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self._do_reopen() + self.h5file.mark("third") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Now go to mark "first" + self.h5file.goto("first") + self.assertIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to mark "third" + self.h5file.goto("third") + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Now go to mark "second" + self._do_reopen() + self.h5file.goto("second") + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to the end + self.h5file.goto(-1) + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + + # Check that objects has come back to life in a sane state + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray4.read(), [6, 7]) + + def test10_gotoint(self): + """Checking mark sequential ids (goto)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10_gotoint..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [3, 4], "Another array 1") + self.h5file.mark("first") + self.h5file.create_array('/', 'otherarray2', [4, 5], "Another array 2") + self.h5file.mark("second") + self._do_reopen() + self.h5file.create_array('/', 'otherarray3', [5, 6], "Another array 3") + self.h5file.mark("third") + self.h5file.create_array('/', 'otherarray4', [6, 7], "Another array 4") + + # Now go to mark "first" + self.h5file.goto(1) + self._do_reopen() + self.assertIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to beginning + self.h5file.goto(0) + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to mark "third" + self._do_reopen() + self.h5file.goto(3) + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Now go to mark "second" + self.h5file.goto(2) + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + self.assertNotIn("/otherarray4", self.h5file) + + # Go to the end + self._do_reopen() + self.h5file.goto(-1) + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertIn("/otherarray4", self.h5file) + + # Check that objects has come back to life in a sane state + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.read(), [3, 4]) + self.assertEqual(self.h5file.root.otherarray2.read(), [4, 5]) + self.assertEqual(self.h5file.root.otherarray3.read(), [5, 6]) + self.assertEqual(self.h5file.root.otherarray4.read(), [6, 7]) + + def test11_contiguous(self): + """Creating contiguous marks""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test11_contiguous..." % self.__class__.__name__) + + self.h5file.enable_undo() + m1 = self.h5file.mark() + m2 = self.h5file.mark() + self.assertNotEqual(m1, m2) + self._do_reopen() + self.h5file.undo(m1) + self.assertEqual(self.h5file.get_current_mark(), m1) + self.h5file.redo(m2) + self.assertEqual(self.h5file.get_current_mark(), m2) + self.h5file.goto(m1) + self.assertEqual(self.h5file.get_current_mark(), m1) + self.h5file.goto(m2) + self.assertEqual(self.h5file.get_current_mark(), m2) + self.h5file.goto(-1) + self._do_reopen() + self.assertEqual(self.h5file.get_current_mark(), m2) + self.h5file.goto(0) + self.assertEqual(self.h5file.get_current_mark(), 0) + + def test12_keepMark(self): + """Ensuring the mark is kept after an UNDO operation""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test12_keepMark..." % self.__class__.__name__) + + self.h5file.enable_undo() + self.h5file.create_array('/', 'newarray1', [1]) + + mid = self.h5file.mark() + self.assertIsNotNone(mid) + self._do_reopen() + self.h5file.undo() + + # We should have moved to the initial mark. + self.assertEqual(self.h5file.get_current_mark(), 0) + + # So /newarray1 should not be there. 
+ self.assertNotIn('/newarray1', self.h5file) + + def test13_severalEnableDisable(self): + """Checking that successive enable/disable Undo works""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13_severalEnableDisable..." % + self.__class__.__name__) + + self.h5file.enable_undo() + self.h5file.create_array('/', 'newarray1', [1]) + self.h5file.undo() + self._do_reopen() + + # We should have moved to 'mid' mark, not the initial mark. + self.assertEqual(self.h5file.get_current_mark(), 0) + + # So /newarray1 should still be there. + self.assertNotIn('/newarray1', self.h5file) + + # Close this do/undo session + self.h5file.disable_undo() + + # Do something + self.h5file.create_array('/', 'newarray2', [1]) + + # Enable again do/undo + self.h5file.enable_undo() + self.h5file.create_array('/', 'newarray3', [1]) + mid = self.h5file.mark() + self.h5file.create_array('/', 'newarray4', [1]) + self.h5file.undo() + + # We should have moved to 'mid' mark, not the initial mark. + self.assertEqual(self.h5file.get_current_mark(), mid) + + # So /newarray2 and /newarray3 should still be there. + self.assertNotIn('/newarray1', self.h5file) + self.assertIn('/newarray2', self.h5file) + self.assertIn('/newarray3', self.h5file) + self.assertNotIn('/newarray4', self.h5file) + + # Close this do/undo session + self._do_reopen() + self.h5file.disable_undo() + + # Enable again do/undo + self.h5file.enable_undo() + self.h5file.create_array('/', 'newarray1', [1]) + self.h5file.create_array('/', 'newarray4', [1]) + + # So /newarray2 and /newarray3 should still be there. + self.assertIn('/newarray1', self.h5file) + self.assertIn('/newarray2', self.h5file) + self.assertIn('/newarray3', self.h5file) + self.assertIn('/newarray4', self.h5file) + self.h5file.undo() + self._do_reopen() + self.assertNotIn('/newarray1', self.h5file) + self.assertIn('/newarray2', self.h5file) + self.assertIn('/newarray3', self.h5file) + self.assertNotIn('/newarray4', self.h5file) + + # Close this do/undo session + self.h5file.disable_undo() + + +class PersistenceTestCase(BasicTestCase): + """Test for basic Undo/Redo operations with persistence.""" + + _reopen_flag = True + + +class CreateArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test for create_array operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + def test00(self): + """Checking one action.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [1, 2], "Another array 1") + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertNotIn("/otherarray1", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray1", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.otherarray1.read(), [1, 2]) + + def test01(self): + """Checking two actions.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [1, 2], "Another array 1") + self.h5file.create_array('/', 'otherarray2', [2, 3], "Another array 2") + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file.root.otherarray1.read(), [1, 2]) + self.assertEqual(self.h5file.root.otherarray2.read(), [2, 3]) + + def test02(self): + """Checking three actions.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [1, 2], "Another array 1") + self.h5file.create_array('/', 'otherarray2', [2, 3], "Another array 2") + self.h5file.create_array('/', 'otherarray3', [3, 4], "Another array 3") + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/otherarray2", self.h5file) + self.assertNotIn("/otherarray3", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/otherarray2", self.h5file) + self.assertIn("/otherarray3", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.otherarray2.title, "Another array 2") + self.assertEqual(self.h5file.root.otherarray3.title, "Another array 3") + self.assertEqual(self.h5file.root.otherarray1.read(), [1, 2]) + self.assertEqual(self.h5file.root.otherarray2.read(), [2, 3]) + self.assertEqual(self.h5file.root.otherarray3.read(), [3, 4]) + + def test03(self): + """Checking three actions in different depth levels.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.create_array('/', 'otherarray1', [1, 2], "Another array 1") + self.h5file.create_array('/agroup', 'otherarray2', + [2, 3], "Another array 2") + self.h5file.create_array('/agroup/agroup3', 'otherarray3', + [3, 4], "Another array 3") + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertNotIn("/otherarray1", self.h5file) + self.assertNotIn("/agroup/otherarray2", self.h5file) + self.assertNotIn("/agroup/agroup3/otherarray3", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertIn("/otherarray1", self.h5file) + self.assertIn("/agroup/otherarray2", self.h5file) + self.assertIn("/agroup/agroup3/otherarray3", self.h5file) + self.assertEqual(self.h5file.root.otherarray1.title, "Another array 1") + self.assertEqual(self.h5file.root.agroup.otherarray2.title, + "Another array 2") + self.assertEqual(self.h5file.root.agroup.agroup3.otherarray3.title, + "Another array 3") + self.assertEqual(self.h5file.root.otherarray1.read(), [1, 2]) + self.assertEqual(self.h5file.root.agroup.otherarray2.read(), [2, 3]) + self.assertEqual(self.h5file.root.agroup.agroup3.otherarray3.read(), + [3, 4]) + + +class CreateGroupTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test for create_group operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + def test00(self): + """Checking one action.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new group + self.h5file.create_group('/', 'othergroup1', "Another group 1") + + # Now undo the past operation + self.h5file.undo() + + # Check that othergroup1 does not exist in the object tree + self.assertNotIn("/othergroup1", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that othergroup1 has come back to life in a sane state + self.assertIn("/othergroup1", self.h5file) + self.assertEqual(self.h5file.root.othergroup1._v_title, + "Another group 1") + + def test01(self): + """Checking two actions.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new group + self.h5file.create_group('/', 'othergroup1', "Another group 1") + self.h5file.create_group('/', 'othergroup2', "Another group 2") + + # Now undo the past operation + self.h5file.undo() + + # Check that othergroup does not exist in the object tree + self.assertNotIn("/othergroup1", self.h5file) + self.assertNotIn("/othergroup2", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that othergroup* has come back to life in a sane state + self.assertIn("/othergroup1", self.h5file) + self.assertIn("/othergroup2", self.h5file) + self.assertEqual(self.h5file.root.othergroup1._v_title, + "Another group 1") + self.assertEqual(self.h5file.root.othergroup2._v_title, + "Another group 2") + + def test02(self): + """Checking three actions.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new group + self.h5file.create_group('/', 'othergroup1', "Another group 1") + self.h5file.create_group('/', 'othergroup2', "Another group 2") + self.h5file.create_group('/', 'othergroup3', "Another group 3") + + # Now undo the past operation + self.h5file.undo() + + # Check that othergroup* does not exist in the object tree + self.assertNotIn("/othergroup1", self.h5file) + self.assertNotIn("/othergroup2", self.h5file) + self.assertNotIn("/othergroup3", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that othergroup* has come back to life in a sane state + self.assertIn("/othergroup1", self.h5file) + self.assertIn("/othergroup2", self.h5file) + self.assertIn("/othergroup3", self.h5file) + self.assertEqual(self.h5file.root.othergroup1._v_title, + "Another group 1") + self.assertEqual(self.h5file.root.othergroup2._v_title, + "Another group 2") + self.assertEqual(self.h5file.root.othergroup3._v_title, + "Another group 3") + + def test03(self): + """Checking three actions in different depth levels.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new group + self.h5file.create_group('/', 'othergroup1', "Another group 1") + self.h5file.create_group( + '/othergroup1', 'othergroup2', "Another group 2") + self.h5file.create_group( + '/othergroup1/othergroup2', 'othergroup3', "Another group 3") + + # Now undo the past operation + self.h5file.undo() + + # Check that othergroup* does not exist in the object tree + self.assertNotIn("/othergroup1", self.h5file) + self.assertNotIn("/othergroup1/othergroup2", self.h5file) + self.assertTrue( + "/othergroup1/othergroup2/othergroup3" not in self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that othergroup* has come back to life in a sane state + self.assertIn("/othergroup1", self.h5file) + self.assertIn("/othergroup1/othergroup2", self.h5file) + self.assertIn("/othergroup1/othergroup2/othergroup3", self.h5file) + self.assertEqual(self.h5file.root.othergroup1._v_title, + "Another group 1") + self.assertEqual(self.h5file.root.othergroup1.othergroup2._v_title, + "Another group 2") + self.assertEqual( + self.h5file.root.othergroup1.othergroup2.othergroup3._v_title, + "Another group 3") + + +minRowIndex = 10 + + +def populateTable(where, name): + """Create a table under where with name name""" + + class Indexed(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"", pos=1) + var2 = tb.BoolCol(dflt=0, pos=2) + var3 = tb.IntCol(dflt=0, pos=3) + var4 = tb.FloatCol(dflt=0, pos=4) + + nrows = minRowIndex + table = where._v_file.create_table(where, name, Indexed, "Indexed", + None, nrows) + for i in range(nrows): + table.row['var1'] = str(i) + + # table.row['var2'] = i > 2 + table.row['var2'] = i % 2 + table.row['var3'] = i + table.row['var4'] = float(nrows - i - 1) + table.row.append() + table.flush() + + # Index all entries: + indexrows = table.cols.var1.create_index() + indexrows = table.cols.var2.create_index() + indexrows = table.cols.var3.create_index() + + # Do not index the var4 column + # indexrows = table.cols.var4.create_index() + if common.verbose: + print("Number of written rows:", nrows) + print("Number of indexed rows:", table.cols.var1.index.nelements) + print("Number of indexed rows(2):", indexrows) + + +class RenameNodeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test for rename_node operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + # Create a table in root + populateTable(self.h5file.root, 'table') + + def test00(self): + """Checking rename_node (over Groups without children)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.rename_node('/agroup2', 'agroup3') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/agroup2", self.h5file) + self.assertNotIn("/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup2._v_title, "Group title 2") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/agroup2", self.h5file) + self.assertIn("/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup3._v_title, "Group title 2") + + def test01(self): + """Checking rename_node (over Groups with children)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.rename_node('/agroup', 'agroup3') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/agroup", self.h5file) + self.assertNotIn("/agroup3", self.h5file) + + # Check that children are reachable + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/agroup", self.h5file) + self.assertIn("/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup3._v_title, "Group title") + + # Check that children are reachable + self.assertIn("/agroup3/anarray1", self.h5file) + self.assertIn("/agroup3/anarray2", self.h5file) + self.assertIn("/agroup3/agroup3", self.h5file) + + def test01b(self): + """Checking rename_node (over Groups with children 2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.rename_node('/agroup', 'agroup3') + self.h5file.rename_node('/agroup3', 'agroup4') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/agroup", self.h5file) + self.assertNotIn("/agroup4", self.h5file) + + # Check that children are reachable + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/agroup", self.h5file) + self.assertIn("/agroup4", self.h5file) + self.assertEqual(self.h5file.root.agroup4._v_title, "Group title") + + # Check that children are reachable + self.assertIn("/agroup4/anarray1", self.h5file) + self.assertIn("/agroup4/anarray2", self.h5file) + self.assertIn("/agroup4/agroup3", self.h5file) + + def test02(self): + """Checking rename_node (over Leaves)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.rename_node('/anarray', 'anarray2') + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertIn("/anarray", self.h5file) + self.assertNotIn("/anarray2", self.h5file) + self.assertEqual(self.h5file.root.anarray.title, "Array title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/anarray", self.h5file) + self.assertIn("/anarray2", self.h5file) + self.assertEqual(self.h5file.root.anarray2.title, "Array title") + + def test03(self): + """Checking rename_node (over Tables)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.rename_node('/table', 'table2') + + # Now undo the past operation + self.h5file.undo() + + # Check that table2 does not exist in the object tree + self.assertIn("/table", self.h5file) + table = self.h5file.root.table + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertIsNone(table.cols.var4.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertNotIn("/table2", self.h5file) + self.assertEqual(self.h5file.root.table.title, "Indexed") + + # Redo the operation + self.h5file.redo() + + # Check that table2 has come back to life in a sane state + self.assertNotIn("/table", self.h5file) + self.assertIn("/table2", self.h5file) + self.assertEqual(self.h5file.root.table2.title, "Indexed") + table = self.h5file.root.table2 + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertIsNone(table.cols.var4.index) + + +class MoveNodeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests for move_node operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + # Create a table in root + populateTable(self.h5file.root, 'table') + + def test00(self): + """Checking move_node (over Leaf)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.move_node('/anarray', '/agroup/agroup3') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/anarray", self.h5file) + self.assertNotIn("/agroup/agroup3/anarray", self.h5file) + self.assertEqual(self.h5file.root.anarray.title, "Array title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/anarray", self.h5file) + self.assertIn("/agroup/agroup3/anarray", self.h5file) + self.assertEqual(self.h5file.root.agroup.agroup3.anarray.title, + "Array title") + + def test01(self): + """Checking move_node (over Groups with children)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.move_node('/agroup', '/agroup2', 'agroup3') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/agroup", self.h5file) + self.assertNotIn("/agroup2/agroup3", self.h5file) + + # Check that children are reachable + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/agroup", self.h5file) + self.assertIn("/agroup2/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup2.agroup3._v_title, + "Group title") + + # Check that children are reachable + self.assertIn("/agroup2/agroup3/anarray1", self.h5file) + self.assertIn("/agroup2/agroup3/anarray2", self.h5file) + self.assertIn("/agroup2/agroup3/agroup3", self.h5file) + + def test01b(self): + """Checking move_node (over Groups with children 2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.move_node('/agroup', '/', 'agroup3') + self.h5file.move_node('/agroup3', '/agroup2', 'agroup4') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does not exist in the object tree + self.assertIn("/agroup", self.h5file) + self.assertNotIn("/agroup2/agroup4", self.h5file) + + # Check that children are reachable + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/agroup", self.h5file) + self.assertIn("/agroup2/agroup4", self.h5file) + self.assertEqual(self.h5file.root.agroup2.agroup4._v_title, + "Group title") + + # Check that children are reachable + self.assertIn("/agroup2/agroup4/anarray1", self.h5file) + self.assertIn("/agroup2/agroup4/anarray2", self.h5file) + self.assertIn("/agroup2/agroup4/agroup3", self.h5file) + + def test02(self): + """Checking move_node (over Leaves)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.move_node('/anarray', '/agroup2', 'anarray2') + + # Now undo the past operation + self.h5file.undo() + + # Check that otherarray does not exist in the object tree + self.assertIn("/anarray", self.h5file) + self.assertNotIn("/agroup2/anarray2", self.h5file) + self.assertEqual(self.h5file.root.anarray.title, "Array title") + + # Redo the operation + self.h5file.redo() + + # Check that otherarray has come back to life in a sane state + self.assertNotIn("/anarray", self.h5file) + self.assertIn("/agroup2/anarray2", self.h5file) + self.assertEqual( + self.h5file.root.agroup2.anarray2.title, "Array title") + + def test03(self): + """Checking move_node (over Tables)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.move_node('/table', '/agroup2', 'table2') + + # Now undo the past operation + self.h5file.undo() + + # Check that table2 does not exist in the object tree + self.assertIn("/table", self.h5file) + self.assertNotIn("/agroup2/table2", self.h5file) + table = self.h5file.root.table + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertIsNone(table.cols.var4.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertEqual(self.h5file.root.table.title, "Indexed") + + # Redo the operation + self.h5file.redo() + + # Check that table2 has come back to life in a sane state + self.assertNotIn("/table", self.h5file) + self.assertIn("/agroup2/table2", self.h5file) + self.assertEqual(self.h5file.root.agroup2.table2.title, "Indexed") + table = self.h5file.root.agroup2.table2 + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertIsNone(table.cols.var4.index) + + +class RemoveNodeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test for remove_node operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + # Create a table in root + populateTable(self.h5file.root, 'table') + + def test00(self): + """Checking remove_node (over Leaf)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Delete an existing array + self.h5file.remove_node('/anarray') + + # Now undo the past operation + self.h5file.undo() + + # Check that it does exist in the object tree + self.assertIn("/anarray", self.h5file) + self.assertEqual(self.h5file.root.anarray.title, "Array title") + + # Redo the operation + self.h5file.redo() + + # Check that array has gone again + self.assertNotIn("/anarray", self.h5file) + + def test00b(self): + """Checking remove_node (over several Leaves)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00b..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Delete a couple of arrays + self.h5file.remove_node('/anarray') + self.h5file.remove_node('/agroup/anarray2') + + # Now undo the past operation + self.h5file.undo() + + # Check that arrays has come into life + self.assertIn("/anarray", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertEqual(self.h5file.root.anarray.title, "Array title") + self.assertEqual( + self.h5file.root.agroup.anarray2.title, "Array title 2") + + # Redo the operation + self.h5file.redo() + + # Check that arrays has disappeared again + self.assertNotIn("/anarray", self.h5file) + self.assertNotIn("/agroup/anarray2", self.h5file) + + def test00c(self): + """Checking remove_node (over Tables)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00c..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Create a new array + self.h5file.remove_node('/table') + + # Now undo the past operation + self.h5file.undo() + + # Check that table2 does not exist in the object tree + self.assertIn("/table", self.h5file) + table = self.h5file.root.table + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertIsNone(table.cols.var4.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertEqual(self.h5file.root.table.title, "Indexed") + + # Redo the operation + self.h5file.redo() + + # Check that table2 has come back to life in a sane state + self.assertNotIn("/table", self.h5file) + + def test01(self): + """Checking remove_node (over Groups with children)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." 
% self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Delete a group recursively + self.h5file.remove_node('/agroup', recursive=1) + + # Now undo the past operation + self.h5file.undo() + + # Check that parent and children has come into life in a sane state + self.assertIn("/agroup", self.h5file) + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that parent and children are not reachable + self.assertNotIn("/agroup", self.h5file) + self.assertNotIn("/agroup/anarray1", self.h5file) + self.assertNotIn("/agroup/anarray2", self.h5file) + self.assertNotIn("/agroup/agroup3", self.h5file) + + def test01b(self): + """Checking remove_node (over Groups with children 2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # Remove a couple of groups + self.h5file.remove_node('/agroup', recursive=1) + self.h5file.remove_node('/agroup2') + + # Now undo the past operation + self.h5file.undo() + + # Check that they does exist in the object tree + self.assertIn("/agroup", self.h5file) + self.assertIn("/agroup2", self.h5file) + + # Check that children are reachable + self.assertIn("/agroup/anarray1", self.h5file) + self.assertIn("/agroup/anarray2", self.h5file) + self.assertIn("/agroup/agroup3", self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + + # Redo the operation + self.h5file.redo() + + # Check that groups does not exist again + self.assertNotIn("/agroup", self.h5file) + self.assertNotIn("/agroup2", self.h5file) + + # Check that children are not reachable + self.assertNotIn("/agroup/anarray1", self.h5file) + self.assertNotIn("/agroup/anarray2", self.h5file) + self.assertNotIn("/agroup/agroup3", self.h5file) + + +class CopyNodeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests for copy_node and copy_children operations""" + + def setUp(self): + super().setUp() + + h5file = self.h5file + root = h5file.root + + # Create an array + h5file.create_array(root, 'array', [1, 2], title="Title example") + + # Create another array object + h5file.create_array(root, 'anarray', [1], "Array title") + + # Create a group object + group = h5file.create_group(root, 'agroup', "Group title") + + # Create a couple of objects there + h5file.create_array(group, 'anarray1', [2], "Array title 1") + h5file.create_array(group, 'anarray2', [2], "Array title 2") + + # Create a lonely group in first level + h5file.create_group(root, 'agroup2', "Group title 2") + + # Create a new group in the second level + h5file.create_group(group, 'agroup3', "Group title 3") + + # Create a table in root + populateTable(self.h5file.root, 'table') + + def test00_copyLeaf(self): + """Checking copy_node (over Leaves)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_copyLeaf..." % self.__class__.__name__) + + # Enable undo/redo. + self.h5file.enable_undo() + + # /anarray => /agroup/agroup3/ + new_node = self.h5file.copy_node('/anarray', '/agroup/agroup3') + + # Undo the copy. + self.h5file.undo() + + # Check that the copied node does not exist in the object tree. + self.assertNotIn('/agroup/agroup3/anarray', self.h5file) + + # Redo the copy. 
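+ # redo() re-applies the copy that the preceding undo() reverted, so the
+ # copied leaf reappears at the destination as the very same node object
+ # that copy_node() returned (checked with assertIs below).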
+ self.h5file.redo() + + # Check that the copied node exists again in the object tree. + self.assertIn('/agroup/agroup3/anarray', self.h5file) + self.assertIs(self.h5file.root.agroup.agroup3.anarray, new_node) + + def test00b_copyTable(self): + """Checking copy_node (over Tables)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00b_copyTable..." % self.__class__.__name__) + + # open the do/undo + self.h5file.enable_undo() + + # /table => /agroup/agroup3/ + warnings.filterwarnings("ignore", category=UserWarning) + table = self.h5file.copy_node( + '/table', '/agroup/agroup3', propindexes=True) + warnings.filterwarnings("default", category=UserWarning) + self.assertIn("/agroup/agroup3/table", self.h5file) + + table = self.h5file.root.agroup.agroup3.table + self.assertEqual(table.title, "Indexed") + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertIsNone(table.cols.var4.index) + + # Now undo the past operation + self.h5file.undo() + table = self.h5file.root.table + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertIsNone(table.cols.var4.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + + # Check that the copied node does not exist in the object tree. + self.assertNotIn("/agroup/agroup3/table", self.h5file) + + # Redo the operation + self.h5file.redo() + + # Check that table has come back to life in a sane state + self.assertIn("/table", self.h5file) + self.assertIn("/agroup/agroup3/table", self.h5file) + table = self.h5file.root.agroup.agroup3.table + self.assertEqual(table.title, "Indexed") + self.assertIsNotNone(table.cols.var1.index) + self.assertIsNotNone(table.cols.var2.index) + self.assertIsNotNone(table.cols.var3.index) + self.assertEqual(table.cols.var1.index.nelements, minRowIndex) + self.assertEqual(table.cols.var2.index.nelements, minRowIndex) + self.assertEqual(table.cols.var3.index.nelements, minRowIndex) + self.assertIsNone(table.cols.var4.index) + + def test01_copyGroup(self): + """Copying a group (recursively).""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_copyGroup..." % self.__class__.__name__) + + # Enable undo/redo. + self.h5file.enable_undo() + + # /agroup => /acopy + new_node = self.h5file.copy_node( + '/agroup', newname='acopy', recursive=True) + + # Undo the copy. + self.h5file.undo() + + # Check that the copied node does not exist in the object tree. + self.assertNotIn('/acopy', self.h5file) + self.assertNotIn('/acopy/anarray1', self.h5file) + self.assertNotIn('/acopy/anarray2', self.h5file) + self.assertNotIn('/acopy/agroup3', self.h5file) + + # Redo the copy. + self.h5file.redo() + + # Check that the copied node exists again in the object tree. 
+ self.assertIn('/acopy', self.h5file)
+ self.assertIn('/acopy/anarray1', self.h5file)
+ self.assertIn('/acopy/anarray2', self.h5file)
+ self.assertIn('/acopy/agroup3', self.h5file)
+ self.assertIs(self.h5file.root.acopy, new_node)
+
+ def test02_copyLeafOverwrite(self):
+ """Copying a leaf, overwriting destination."""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test02_copyLeafOverwrite..." %
+ self.__class__.__name__)
+
+ # Enable undo/redo.
+ self.h5file.enable_undo()
+
+ # /anarray => /agroup (overwriting the existing group)
+ oldNode = self.h5file.root.agroup
+ new_node = self.h5file.copy_node(
+ '/anarray', newname='agroup', overwrite=True)
+
+ # Undo the copy.
+ self.h5file.undo()
+
+ # Check that the copied node does not exist in the object tree.
+ # Check that the overwritten node exists again in the object tree.
+ self.assertIs(self.h5file.root.agroup, oldNode)
+
+ # Redo the copy.
+ self.h5file.redo()
+
+ # Check that the copied node exists again in the object tree.
+ # Check that the overwritten node does not exist in the object tree.
+ self.assertIs(self.h5file.root.agroup, new_node)
+
+ def test03_copyChildren(self):
+ """Copying the children of a group"""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test03_copyChildren..." %
+ self.__class__.__name__)
+
+ # Enable undo/redo.
+ self.h5file.enable_undo()
+
+ # /agroup/* => /agroup2/
+ self.h5file.copy_children('/agroup', '/agroup2', recursive=True)
+
+ # Undo the copy.
+ self.h5file.undo()
+
+ # Check that the copied nodes do not exist in the object tree.
+ self.assertNotIn('/agroup2/anarray1', self.h5file)
+ self.assertNotIn('/agroup2/anarray2', self.h5file)
+ self.assertNotIn('/agroup2/agroup3', self.h5file)
+
+ # Redo the copy.
+ self.h5file.redo()
+
+ # Check that the copied nodes exist again in the object tree.
+ self.assertIn('/agroup2/anarray1', self.h5file)
+ self.assertIn('/agroup2/anarray2', self.h5file)
+ self.assertIn('/agroup2/agroup3', self.h5file)
+
+
+class ComplexTestCase(common.TempFileMixin, common.PyTablesTestCase):
+ """Tests for a mix of all operations"""
+
+ def setUp(self):
+ super().setUp()
+
+ h5file = self.h5file
+ root = h5file.root
+
+ # Create an array
+ h5file.create_array(root, 'array', [1, 2], title="Title example")
+
+ # Create another array object
+ h5file.create_array(root, 'anarray', [1], "Array title")
+
+ # Create a group object
+ group = h5file.create_group(root, 'agroup', "Group title")
+
+ # Create a couple of objects there
+ h5file.create_array(group, 'anarray1', [2], "Array title 1")
+ h5file.create_array(group, 'anarray2', [2], "Array title 2")
+
+ # Create a lonely group in first level
+ h5file.create_group(root, 'agroup2', "Group title 2")
+
+ # Create a new group in the second level
+ h5file.create_group(group, 'agroup3', "Group title 3")
+
+ def test00(self):
+ """Mix of create_array, create_group, rename_node, move_node,
+ remove_node, copy_node and copy_children."""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test00..." % self.__class__.__name__)
+
+ # Enable undo/redo.
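+ # enable_undo() begins logging the structural changes made below; with no
+ # intermediate marks, a single undo() reverts everything done since this
+ # point and a single redo() replays it.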
+ self.h5file.enable_undo() + + # Create an array + self.h5file.create_array(self.h5file.root, 'anarray3', + [1], "Array title 3") + # Create a group + self.h5file.create_group(self.h5file.root, 'agroup3', "Group title 3") + + # /anarray => /agroup/agroup3/ + new_node = self.h5file.copy_node('/anarray3', '/agroup/agroup3') + new_node = self.h5file.copy_children( + '/agroup', '/agroup3', recursive=1) + + # rename anarray + self.h5file.rename_node('/anarray', 'anarray4') + + # Move anarray + new_node = self.h5file.copy_node('/anarray3', '/agroup') + + # Remove anarray4 + self.h5file.remove_node('/anarray4') + + # Undo the actions + self.h5file.undo() + self.assertNotIn('/anarray4', self.h5file) + self.assertNotIn('/anarray3', self.h5file) + self.assertNotIn('/agroup/agroup3/anarray3', self.h5file) + self.assertNotIn('/agroup3', self.h5file) + self.assertNotIn('/anarray4', self.h5file) + self.assertIn('/anarray', self.h5file) + + # Redo the actions + self.h5file.redo() + + # Check that the copied node exists again in the object tree. + self.assertIn('/agroup/agroup3/anarray3', self.h5file) + self.assertIn('/agroup/anarray3', self.h5file) + self.assertIn('/agroup3/agroup3/anarray3', self.h5file) + self.assertNotIn('/agroup3/anarray3', self.h5file) + self.assertIs(self.h5file.root.agroup.anarray3, new_node) + self.assertNotIn('/anarray', self.h5file) + self.assertNotIn('/anarray4', self.h5file) + + def test01(self): + """Test with multiple generations (Leaf case)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." % self.__class__.__name__) + + # Enable undo/redo. + self.h5file.enable_undo() + + # remove /anarray + self.h5file.remove_node('/anarray') + + # Create an array in the same place + self.h5file.create_array(self.h5file.root, 'anarray', + [2], "Array title 2") + # remove the array again + self.h5file.remove_node('/anarray') + + # Create an array + self.h5file.create_array(self.h5file.root, 'anarray', + [3], "Array title 3") + # remove the array again + self.h5file.remove_node('/anarray') + + # Create an array + self.h5file.create_array(self.h5file.root, 'anarray', + [4], "Array title 4") + # Undo the actions + self.h5file.undo() + + # Check that /anarray is in the correct state before redoing + self.assertEqual(self.h5file.root.anarray.title, "Array title") + self.assertEqual(self.h5file.root.anarray[:], [1]) + + # Redo the actions + self.h5file.redo() + self.assertEqual(self.h5file.root.anarray.title, "Array title 4") + self.assertEqual(self.h5file.root.anarray[:], [4]) + + def test02(self): + """Test with multiple generations (Group case)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." % self.__class__.__name__) + + # Enable undo/redo. 
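+ # /agroup2 (created in setUp with title "Group title 2") is removed and
+ # re-created several times below; undo() must restore the original
+ # generation and redo() the most recent one.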
+ self.h5file.enable_undo() + + # remove /agroup + self.h5file.remove_node('/agroup2') + + # Create a group in the same place + self.h5file.create_group(self.h5file.root, 'agroup2', "Group title 22") + + # remove the group + self.h5file.remove_node('/agroup2') + + # Create a group + self.h5file.create_group(self.h5file.root, 'agroup2', "Group title 3") + + # remove the group + self.h5file.remove_node('/agroup2') + + # Create a group + self.h5file.create_group(self.h5file.root, 'agroup2', "Group title 4") + + # Create a child group + self.h5file.create_group(self.h5file.root.agroup2, 'agroup5', + "Group title 5") + + # Undo the actions + self.h5file.undo() + + # Check that /agroup is in the state before enabling do/undo + self.assertEqual(self.h5file.root.agroup2._v_title, "Group title 2") + self.assertIn('/agroup2', self.h5file) + + # Redo the actions + self.h5file.redo() + self.assertEqual(self.h5file.root.agroup2._v_title, "Group title 4") + self.assertEqual(self.h5file.root.agroup2.agroup5._v_title, + "Group title 5") + + def test03(self): + """Test with multiple generations (Group case, recursive remove)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." % self.__class__.__name__) + + # Enable undo/redo. + self.h5file.enable_undo() + + # remove /agroup + self.h5file.remove_node('/agroup', recursive=1) + + # Create a group in the same place + self.h5file.create_group(self.h5file.root, 'agroup', "Group title 2") + + # remove the group + self.h5file.remove_node('/agroup') + + # Create a group + self.h5file.create_group(self.h5file.root, 'agroup', "Group title 3") + + # remove the group + self.h5file.remove_node('/agroup') + + # Create a group + self.h5file.create_group(self.h5file.root, 'agroup', "Group title 4") + + # Create a child group + self.h5file.create_group(self.h5file.root.agroup, 'agroup5', + "Group title 5") + # Undo the actions + self.h5file.undo() + + # Check that /agroup is in the state before enabling do/undo + self.assertIn('/agroup', self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title") + self.assertIn('/agroup/anarray1', self.h5file) + self.assertIn('/agroup/anarray2', self.h5file) + self.assertIn('/agroup/agroup3', self.h5file) + self.assertNotIn('/agroup/agroup5', self.h5file) + + # Redo the actions + self.h5file.redo() + self.assertIn('/agroup', self.h5file) + self.assertEqual(self.h5file.root.agroup._v_title, "Group title 4") + self.assertIn('/agroup/agroup5', self.h5file) + self.assertEqual( + self.h5file.root.agroup.agroup5._v_title, "Group title 5") + + def test03b(self): + """Test with multiple generations (Group case, recursive remove, + case 2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b..." % self.__class__.__name__) + + # Enable undo/redo. 
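+ # Unlike test03, the group removed recursively here (/agroup3) is created
+ # after enable_undo(), so undo() makes it disappear completely instead of
+ # restoring a pre-existing version.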
+ self.h5file.enable_undo() + + # Create a new group with a child + self.h5file.create_group(self.h5file.root, 'agroup3', "Group title 3") + self.h5file.create_group(self.h5file.root.agroup3, 'agroup4', + "Group title 4") + + # remove /agroup3 + self.h5file.remove_node('/agroup3', recursive=1) + + # Create a group in the same place + self.h5file.create_group(self.h5file.root, 'agroup3', "Group title 4") + + # Undo the actions + self.h5file.undo() + + # Check that /agroup is in the state before enabling do/undo + self.assertNotIn('/agroup3', self.h5file) + + # Redo the actions + self.h5file.redo() + self.assertEqual(self.h5file.root.agroup3._v_title, "Group title 4") + self.assertIn('/agroup3', self.h5file) + self.assertNotIn('/agroup/agroup4', self.h5file) + + +class AttributesTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests for operation on attributes""" + + def setUp(self): + super().setUp() + + # Create an array. + array = self.h5file.create_array('/', 'array', [1, 2]) + + # Set some attributes on it. + attrs = array.attrs + attrs.attr_1 = 10 + attrs.attr_2 = 20 + attrs.attr_3 = 30 + + def test00_setAttr(self): + """Setting a nonexistent attribute""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_setAttr..." % self.__class__.__name__) + + array = self.h5file.root.array + attrs = array.attrs + + self.h5file.enable_undo() + setattr(attrs, 'attr_0', 0) + self.assertIn('attr_0', attrs) + self.assertEqual(attrs.attr_0, 0) + self.h5file.undo() + self.assertNotIn('attr_0', attrs) + self.h5file.redo() + self.assertIn('attr_0', attrs) + self.assertEqual(attrs.attr_0, 0) + + def test01_setAttrExisting(self): + """Setting an existing attribute""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_setAttrExisting..." % + self.__class__.__name__) + + array = self.h5file.root.array + attrs = array.attrs + + self.h5file.enable_undo() + setattr(attrs, 'attr_1', 11) + self.assertIn('attr_1', attrs) + self.assertEqual(attrs.attr_1, 11) + self.h5file.undo() + self.assertIn('attr_1', attrs) + self.assertEqual(attrs.attr_1, 10) + self.h5file.redo() + self.assertIn('attr_1', attrs) + self.assertEqual(attrs.attr_1, 11) + + def test02_delAttr(self): + """Removing an attribute""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_delAttr..." % self.__class__.__name__) + + array = self.h5file.root.array + attrs = array.attrs + + self.h5file.enable_undo() + delattr(attrs, 'attr_1') + self.assertNotIn('attr_1', attrs) + self.h5file.undo() + self.assertIn('attr_1', attrs) + self.assertEqual(attrs.attr_1, 10) + self.h5file.redo() + self.assertNotIn('attr_1', attrs) + + def test03_copyNodeAttrs(self): + """Copying an attribute set""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_copyNodeAttrs..." 
% + self.__class__.__name__) + + rattrs = self.h5file.root._v_attrs + rattrs.attr_0 = 0 + rattrs.attr_1 = 100 + + array = self.h5file.root.array + attrs = array.attrs + + self.h5file.enable_undo() + attrs._f_copy(self.h5file.root) + self.assertEqual(rattrs.attr_0, 0) + self.assertEqual(rattrs.attr_1, 10) + self.assertEqual(rattrs.attr_2, 20) + self.assertEqual(rattrs.attr_3, 30) + self.h5file.undo() + self.assertEqual(rattrs.attr_0, 0) + self.assertEqual(rattrs.attr_1, 100) + self.assertNotIn('attr_2', rattrs) + self.assertNotIn('attr_3', rattrs) + self.h5file.redo() + self.assertEqual(rattrs.attr_0, 0) + self.assertEqual(rattrs.attr_1, 10) + self.assertEqual(rattrs.attr_2, 20) + self.assertEqual(rattrs.attr_3, 30) + + def test04_replaceNode(self): + """Replacing a node with a rewritten attribute""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_replaceNode..." % self.__class__.__name__) + + array = self.h5file.root.array + attrs = array.attrs + + self.h5file.enable_undo() + attrs.attr_1 = 11 + self.h5file.remove_node('/array') + arr = self.h5file.create_array('/', 'array', [1]) + arr.attrs.attr_1 = 12 + self.h5file.undo() + self.assertIn('attr_1', self.h5file.root.array.attrs) + self.assertEqual(self.h5file.root.array.attrs.attr_1, 10) + self.h5file.redo() + self.assertIn('attr_1', self.h5file.root.array.attrs) + self.assertEqual(self.h5file.root.array.attrs.attr_1, 12) + + +class NotLoggedTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test not logged nodes.""" + + class NotLoggedArray(tb.node.NotLoggedMixin, tb.Array): + pass + + def test00_hierarchy(self): + """Performing hierarchy operations on a not logged node.""" + + self.h5file.create_group('/', 'tgroup') + self.h5file.enable_undo() + + # Node creation is not undone. + arr = self.NotLoggedArray(self.h5file.root, 'test', + [1], self._getMethodName()) + self.h5file.undo() + self.assertIn('/test', self.h5file) + + # Node movement is not undone. + arr.move('/tgroup') + self.h5file.undo() + self.assertIn('/tgroup/test', self.h5file) + + # Node removal is not undone. + arr.remove() + self.h5file.undo() + self.assertNotIn('/tgroup/test', self.h5file) + + def test01_attributes(self): + """Performing attribute operations on a not logged node.""" + + arr = self.NotLoggedArray(self.h5file.root, 'test', + [1], self._getMethodName()) + self.h5file.enable_undo() + + # Attribute creation is not undone. + arr._v_attrs.foo = 'bar' + self.h5file.undo() + self.assertEqual(arr._v_attrs.foo, 'bar') + + # Attribute change is not undone. + arr._v_attrs.foo = 'baz' + self.h5file.undo() + self.assertEqual(arr._v_attrs.foo, 'baz') + + # Attribute removal is not undone. 
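+ # NotLoggedMixin nodes are kept out of the undo log, so undo() leaves the
+ # attribute deletion in place and reading it afterwards must raise
+ # AttributeError.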
+ del arr._v_attrs.foo + self.h5file.undo() + self.assertRaises(AttributeError, getattr, arr._v_attrs, 'foo') + + +class CreateParentsTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test the ``createparents`` flag.""" + + def setUp(self): + super().setUp() + g1 = self.h5file.create_group('/', 'g1') + self.h5file.create_group(g1, 'g2') + + def existing(self, paths): + """Return a set of the existing paths in `paths`.""" + return frozenset(path for path in paths if path in self.h5file) + + def basetest(self, doit, pre, post): + pre() + self.h5file.enable_undo() + + paths = ['/g1', '/g1/g2', '/g1/g2/g3', '/g1/g2/g3/g4'] + for newpath in paths: + before = self.existing(paths) + doit(newpath) + after = self.existing(paths) + self.assertTrue(after.issuperset(before)) + + self.h5file.undo() + post(newpath) + after = self.existing(paths) + self.assertEqual(after, before) + + def test00_create(self): + """Test creating a node.""" + + def pre(): + pass + + def doit(newpath): + self.h5file.create_array(newpath, 'array', [1], createparents=True) + self.assertIn(tb.path.join_path(newpath, 'array'), self.h5file) + + def post(newpath): + self.assertNotIn(tb.path.join_path(newpath, 'array'), self.h5file) + self.basetest(doit, pre, post) + + def test01_move(self): + """Test moving a node.""" + + def pre(): + self.h5file.create_array('/', 'array', [1]) + + def doit(newpath): + self.h5file.move_node('/array', newpath, createparents=True) + self.assertNotIn('/array', self.h5file) + self.assertIn(tb.path.join_path(newpath, 'array'), self.h5file) + + def post(newpath): + self.assertIn('/array', self.h5file) + self.assertNotIn(tb.path.join_path(newpath, 'array'), self.h5file) + self.basetest(doit, pre, post) + + def test02_copy(self): + """Test copying a node.""" + + def pre(): + self.h5file.create_array('/', 'array', [1]) + + def doit(newpath): + self.h5file.copy_node('/array', newpath, createparents=True) + self.assertIn(tb.path.join_path(newpath, 'array'), self.h5file) + + def post(newpath): + self.assertNotIn(tb.path.join_path(newpath, 'array'), self.h5file) + self.basetest(doit, pre, post) + + def test03_copyChildren(self): + """Test copying the children of a group.""" + + def pre(): + g = self.h5file.create_group('/', 'group') + self.h5file.create_array(g, 'array1', [1]) + self.h5file.create_array(g, 'array2', [1]) + + def doit(newpath): + self.h5file.copy_children('/group', newpath, createparents=True) + self.assertIn(tb.path.join_path(newpath, 'array1'), self.h5file) + self.assertIn(tb.path.join_path(newpath, 'array2'), self.h5file) + + def post(newpath): + self.assertNotIn(tb.path.join_path(newpath, 'array1'), self.h5file) + self.assertNotIn(tb.path.join_path(newpath, 'array2'), self.h5file) + self.basetest(doit, pre, post) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicTestCase)) + theSuite.addTest(common.unittest.makeSuite(PersistenceTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateGroupTestCase)) + theSuite.addTest(common.unittest.makeSuite(RenameNodeTestCase)) + theSuite.addTest(common.unittest.makeSuite(MoveNodeTestCase)) + theSuite.addTest(common.unittest.makeSuite(RemoveNodeTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyNodeTestCase)) + theSuite.addTest(common.unittest.makeSuite(AttributesTestCase)) + 
theSuite.addTest(common.unittest.makeSuite(ComplexTestCase)) + theSuite.addTest(common.unittest.makeSuite(NotLoggedTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateParentsTestCase)) + if common.heavy: + pass + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_earray.py b/tables/tests/test_earray.py new file mode 100644 index 0000000..8e400e2 --- /dev/null +++ b/tables/tests/test_earray.py @@ -0,0 +1,2890 @@ +import sys +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase): + # Default values + obj = None + flavor = "numpy" + type = 'int32' + dtype = 'int32' + shape = (2, 0) + start = 0 + stop = 10 + step = 1 + length = 1 + chunksize = 5 + nappends = 10 + compress = 0 + complib = "zlib" # Default compression library + shuffle = 0 + fletcher32 = 0 + reopen = 1 # Tells whether the file has to be reopened on each test or not + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + if self.reopen: + # Close the file + self.h5file.close() + + def populateFile(self): + group = self.rootgroup + obj = self.obj + if obj is None: + if self.type == "string": + atom = tb.StringAtom(itemsize=self.length) + else: + atom = tb.Atom.from_type(self.type) + else: + atom = None + title = self.__class__.__name__ + filters = tb.Filters(complevel=self.compress, + complib=self.complib, + shuffle=self.shuffle, + fletcher32=self.fletcher32) + earray = self.h5file.create_earray(group, 'earray1', + atom=atom, shape=self.shape, + title=title, filters=filters, + expectedrows=1, obj=obj) + earray.flavor = self.flavor + + # Fill it with rows + self.rowshape = list(earray.shape) + if obj is not None: + self.rowshape[0] = 0 + self.objsize = self.length + for i in self.rowshape: + if i != 0: + self.objsize *= i + self.extdim = earray.extdim + self.objsize *= self.chunksize + self.rowshape[earray.extdim] = self.chunksize + + if self.type == "string": + object = np.ndarray(buffer=b"a"*self.objsize, + shape=self.rowshape, + dtype="S%s" % earray.atom.itemsize) + else: + object = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object.shape = self.rowshape + + if common.verbose: + if self.flavor == "numpy": + print("Object to append -->", object) + else: + print("Object to append -->", repr(object)) + for i in range(self.nappends): + if self.type == "string": + earray.append(object) + else: + earray.append(object * i) + + def _get_shape(self): + if self.shape is not None: + shape = self.shape + else: + shape = np.asarray(self.obj).shape + + return shape + + def test00_attributes(self): + if self.reopen: + self._reopen() + obj = self.h5file.get_node("/earray1") + + shape = self._get_shape() + shape = list(shape) + shape[self.extdim] = self.chunksize * self.nappends + if self.obj is not None: + shape[self.extdim] += len(self.obj) + shape = tuple(shape) + + self.assertEqual(obj.flavor, self.flavor) + self.assertEqual(obj.shape, shape) + self.assertEqual(obj.ndim, len(shape)) + self.assertEqual(obj.nrows, shape[self.extdim]) + self.assertEqual(obj.atom.type, self.type) + + def test01_iterEArray(self): + """Checking enlargeable array iterator.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_iterEArray..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen() + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + earray.nrowsinbuf = 3 + if common.verbose: + print("EArray descr:", repr(earray)) + print("shape of read array ==>", earray.shape) + print("reopening?:", self.reopen) + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a"*self.objsize, + shape=self.rowshape, + dtype="S%s" % earray.atom.itemsize) + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + shape = self._get_shape() + + # Read all the array + for idx, row in enumerate(earray): + if idx < initialrows: + self.assertTrue(common.allequal( + row, np.asarray(self.obj[idx]), self.flavor)) + continue + + chunk = int((earray.nrow - initialrows) % self.chunksize) + if chunk == 0: + if self.type == "string": + object__ = object_ + else: + i = int(earray.nrow - initialrows) + object__ = object_ * (i // self.chunksize) + + object = object__[chunk] + # The next adds much more verbosity + if common.verbose and 0: + print("number of row ==>", earray.nrow) + if hasattr(object, "shape"): + print("shape should look as:", object.shape) + print("row in earray ==>", repr(row)) + print("Should look like ==>", repr(object)) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + if hasattr(row, "shape"): + self.assertEqual(len(row.shape), len(shape) - 1) + else: + # Scalar case + self.assertEqual(len(shape), 1) + + # Check filters: + if self.compress != earray.filters.complevel and common.verbose: + print("Error in compress. Class:", self.__class__.__name__) + print("self, earray:", self.compress, earray.filters.complevel) + self.assertEqual(earray.filters.complevel, self.compress) + if self.compress > 0 and tb.which_lib_version(self.complib): + self.assertEqual(earray.filters.complib, self.complib) + if self.shuffle != earray.filters.shuffle and common.verbose: + print("Error in shuffle. Class:", self.__class__.__name__) + print("self, earray:", self.shuffle, earray.filters.shuffle) + self.assertEqual(self.shuffle, earray.filters.shuffle) + if self.fletcher32 != earray.filters.fletcher32 and common.verbose: + print("Error in fletcher32. Class:", self.__class__.__name__) + print("self, earray:", self.fletcher32, + earray.filters.fletcher32) + self.assertEqual(self.fletcher32, earray.filters.fletcher32) + + def test02_sssEArray(self): + """Checking enlargeable array iterator with (start, stop, step)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_sssEArray..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen() + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + earray.nrowsinbuf = 3 + if common.verbose: + print("EArray descr:", repr(earray)) + print("shape of read array ==>", earray.shape) + print("reopening?:", self.reopen) + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a"*self.objsize, + shape=self.rowshape, + dtype="S%s" % earray.atom.itemsize) + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + shape = self._get_shape() + + # Read all the array + for idx, row in enumerate(earray.iterrows(start=self.start, + stop=self.stop, + step=self.step)): + if idx < initialrows: + self.assertTrue(common.allequal( + row, np.asarray(self.obj[idx]), self.flavor)) + continue + + if self.chunksize == 1: + index = 0 + else: + index = int((earray.nrow - initialrows) % self.chunksize) + + if self.type == "string": + object__ = object_ + else: + i = int(earray.nrow - initialrows) + object__ = object_ * (i // self.chunksize) + object = object__[index] + + # The next adds much more verbosity + if common.verbose and 0: + print("number of row ==>", earray.nrow) + if hasattr(object, "shape"): + print("shape should look as:", object.shape) + print("row in earray ==>", repr(row)) + print("Should look like ==>", repr(object)) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + if hasattr(row, "shape"): + self.assertEqual(len(row.shape), len(shape) - 1) + else: + # Scalar case + self.assertEqual(len(shape), 1) + + def test03_readEArray(self): + """Checking read() of enlargeable arrays.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_readEArray..." 
% self.__class__.__name__) + + # This conversion made just in case indices are numpy scalars + if self.start is not None: + self.start = int(self.start) + if self.stop is not None: + self.stop = int(self.stop) + if self.step is not None: + self.step = int(self.step) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen() + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + earray.nrowsinbuf = 3 + if common.verbose: + print("EArray descr:", repr(earray)) + print("shape of read array ==>", earray.shape) + print("reopening?:", self.reopen) + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a"*self.objsize, + shape=self.rowshape, + dtype="S%s" % earray.atom.itemsize) + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + rowshape = self.rowshape + rowshape[self.extdim] *= (self.nappends + initialrows) + if self.type == "string": + object__ = np.empty(shape=rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object__ = np.empty(shape=rowshape, dtype=self.dtype) + + object__ = object__.swapaxes(0, self.extdim) + + if initialrows: + object__[0:initialrows] = self.obj + + for i in range(self.nappends): + j = initialrows + i * self.chunksize + if self.type == "string": + object__[j:j + self.chunksize] = object_ + else: + object__[j:j + self.chunksize] = object_ * i + + stop = self.stop + + if self.nappends: + # stop == None means read only the element designed by start + # (in read() contexts) + if self.stop is None: + if self.start == -1: # corner case + stop = earray.nrows + else: + stop = self.start + 1 + # Protection against number of elements less than existing + # if rowshape[self.extdim] < self.stop or self.stop == 0: + if rowshape[self.extdim] < stop: + # self.stop == 0 means last row only in read() + # and not in [::] slicing notation + stop = rowshape[self.extdim] + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + # object = object__[self.start:stop:self.step].copy() + object = object__[self.start:self.stop:self.step].copy() + # Swap the axes again to have normal ordering + if self.flavor == "numpy": + object = object.swapaxes(0, self.extdim) + else: + object = np.empty(shape=self.shape, dtype=self.dtype) + + # Read all the array + try: + row = earray.read(self.start, self.stop, self.step) + except IndexError: + row = np.empty(shape=self.shape, dtype=self.dtype) + + if common.verbose: + if hasattr(object, "shape"): + print("shape should look as:", object.shape) + print("Object read ==>", repr(row)) + print("Should look like ==>", repr(object)) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + + shape = self._get_shape() + if hasattr(row, "shape"): + self.assertEqual(len(row.shape), len(shape)) + if self.flavor == "numpy": + self.assertEqual(row.itemsize, earray.atom.itemsize) + else: + # Scalar case + self.assertEqual(len(shape), 1) + + def test03_readEArray_out_argument(self): + """Checking read() of enlargeable arrays.""" + + # This conversion made just in case indices are numpy scalars + if self.start is not None: + self.start = int(self.start) + if self.stop is not None: + self.stop = int(self.stop) + if self.step is not None: + self.step = 
int(self.step) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen() + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + earray.nrowsinbuf = 3 + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a"*self.objsize, + shape=self.rowshape, + dtype="S%s" % earray.atom.itemsize) + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + rowshape = self.rowshape + rowshape[self.extdim] *= (self.nappends + initialrows) + if self.type == "string": + object__ = np.empty(shape=rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object__ = np.empty(shape=rowshape, dtype=self.dtype) + + object__ = object__.swapaxes(0, self.extdim) + + if initialrows: + object__[0:initialrows] = self.obj + + for i in range(self.nappends): + j = initialrows + i * self.chunksize + if self.type == "string": + object__[j:j + self.chunksize] = object_ + else: + object__[j:j + self.chunksize] = object_ * i + + stop = self.stop + + if self.nappends: + # stop == None means read only the element designed by start + # (in read() contexts) + if self.stop is None: + if self.start == -1: # corner case + stop = earray.nrows + else: + stop = self.start + 1 + # Protection against number of elements less than existing + # if rowshape[self.extdim] < self.stop or self.stop == 0: + if rowshape[self.extdim] < stop: + # self.stop == 0 means last row only in read() + # and not in [::] slicing notation + stop = rowshape[self.extdim] + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + # object = object__[self.start:stop:self.step].copy() + object = object__[self.start:self.stop:self.step].copy() + # Swap the axes again to have normal ordering + if self.flavor == "numpy": + object = object.swapaxes(0, self.extdim) + else: + object = np.empty(shape=self.shape, dtype=self.dtype) + + # Read all the array + try: + row = np.empty(earray.shape, dtype=earray.atom.dtype) + slice_obj = [slice(None)] * len(earray.shape) + # slice_obj[earray.maindim] = slice(self.start, stop, self.step) + slice_obj[earray.maindim] = slice(self.start, self.stop, self.step) + row = row[tuple(slice_obj)].copy() + earray.read(self.start, self.stop, self.step, out=row) + except IndexError: + row = np.empty(shape=self.shape, dtype=self.dtype) + + if common.verbose: + if hasattr(object, "shape"): + print("shape should look as:", object.shape) + print("Object read ==>", repr(row)) + print("Should look like ==>", repr(object)) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + + shape = self._get_shape() + if hasattr(row, "shape"): + self.assertEqual(len(row.shape), len(shape)) + if self.flavor == "numpy": + self.assertEqual(row.itemsize, earray.atom.itemsize) + else: + # Scalar case + self.assertEqual(len(shape), 1) + + def test04_getitemEArray(self): + """Checking enlargeable array __getitem__ special method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_getitemEArray..." 
% + self.__class__.__name__) + + if not hasattr(self, "slices"): + # If there is not a slices attribute, create it + # This conversion made just in case indices are numpy scalars + if self.start is not None: + self.start = int(self.start) + if self.stop is not None: + self.stop = int(self.stop) + if self.step is not None: + self.step = int(self.step) + self.slices = (slice(self.start, self.stop, self.step),) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen() + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + # earray.nrowsinbuf = 3 # this does not really changes the chunksize + if common.verbose: + print("EArray descr:", repr(earray)) + print("shape of read array ==>", earray.shape) + print("reopening?:", self.reopen) + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a" * self.objsize, + shape=self.rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + rowshape = self.rowshape + rowshape[self.extdim] *= (self.nappends + initialrows) + if self.type == "string": + object__ = np.empty(shape=rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object__ = np.empty(shape=rowshape, dtype=self.dtype) + # Additional conversion for the numpy case + object__ = object__.swapaxes(0, earray.extdim) + + if initialrows: + object__[0:initialrows] = self.obj + + for i in range(self.nappends): + j = initialrows + i * self.chunksize + if self.type == "string": + object__[j:j + self.chunksize] = object_ + else: + object__[j:j + self.chunksize] = object_ * i + + if self.nappends: + # Swap the axes again to have normal ordering + if self.flavor == "numpy": + object__ = object__.swapaxes(0, self.extdim) + else: + object__.swapaxes(0, self.extdim) + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + object = object__.__getitem__(self.slices).copy() + else: + object = np.empty(shape=self.shape, dtype=self.dtype) + + # Read all the array + try: + row = earray.__getitem__(self.slices) + except IndexError: + row = np.empty(shape=self.shape, dtype=self.dtype) + + if common.verbose: + print("Object read:\n", repr(row)) + print("Should look like:\n", repr(object)) + if hasattr(object, "shape"): + print("Original object shape:", self.shape) + print("Shape read:", row.shape) + print("shape should look as:", object.shape) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + if not hasattr(row, "shape"): + # Scalar case + self.assertEqual(len(self.shape), 1) + + def test05_setitemEArray(self): + """Checking enlargeable array __setitem__ special method.""" + + if self.__class__.__name__ == "Ellipsis6EArrayTestCase": + # We have a problem with test design here, but I think + # it is not worth the effort to solve it + # F.Alted 2004-10-27 + return + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_setitemEArray..." 
% + self.__class__.__name__) + + if not hasattr(self, "slices"): + # If there is not a slices attribute, create it + # This conversion made just in case indices are numpy scalars + if self.start is not None: + self.start = int(self.start) + if self.stop is not None: + self.stop = int(self.stop) + if self.step is not None: + self.step = int(self.step) + self.slices = (slice(self.start, self.stop, self.step),) + + # Create an instance of an HDF5 Table + if self.reopen: + self._reopen(mode="a") + earray = self.h5file.get_node("/earray1") + + # Choose a small value for buffer size + # earray.nrowsinbuf = 3 # this does not really changes the chunksize + if common.verbose: + print("EArray descr:", repr(earray)) + print("shape of read array ==>", earray.shape) + print("reopening?:", self.reopen) + + # Build the array to do comparisons + if self.type == "string": + object_ = np.ndarray(buffer=b"a" * self.objsize, + shape=self.rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object_ = np.arange(self.objsize, dtype=earray.atom.dtype.base) + object_.shape = self.rowshape + + object_ = object_.swapaxes(earray.extdim, 0) + + if self.obj is not None: + initialrows = len(self.obj) + else: + initialrows = 0 + + rowshape = self.rowshape + rowshape[self.extdim] *= (self.nappends + initialrows) + if self.type == "string": + object__ = np.empty(shape=rowshape, + dtype=f"S{earray.atom.itemsize}") + else: + object__ = np.empty(shape=rowshape, dtype=self.dtype) + # Additional conversion for the numpy case + object__ = object__.swapaxes(0, earray.extdim) + + for i in range(self.nappends): + j = initialrows + i * self.chunksize + if self.type == "string": + object__[j:j + self.chunksize] = object_ + else: + object__[j:j + self.chunksize] = object_ * i + # Modify the earray + # earray[j:j + self.chunksize] = object_ * i + # earray[self.slices] = 1 + + if initialrows: + object__[0:initialrows] = self.obj + + if self.nappends: + # Swap the axes again to have normal ordering + if self.flavor == "numpy": + object__ = object__.swapaxes(0, self.extdim) + else: + object__.swapaxes(0, self.extdim) + # do a copy() in order to ensure that len(object._data) + # actually do a measure of its length + object = object__.__getitem__(self.slices).copy() + else: + object = np.empty(shape=self.shape, dtype=self.dtype) + + if self.flavor == "numpy": + object = np.asarray(object) + + if self.type == "string": + if hasattr(self, "wslice"): + # These sentences should be equivalent + # object[self.wslize] = object[self.wslice].pad("xXx") + # earray[self.wslice] = earray[self.wslice].pad("xXx") + object[self.wslize] = "xXx" + earray[self.wslice] = "xXx" + elif sum(object[self.slices].shape) != 0: + # object[:] = object.pad("xXx") + object[:] = "xXx" + if object.size > 0: + earray[self.slices] = object + else: + if hasattr(self, "wslice"): + object[self.wslice] = object[self.wslice] * 2 + 3 + earray[self.wslice] = earray[self.wslice] * 2 + 3 + elif sum(object[self.slices].shape) != 0: + object = object * 2 + 3 + if np.prod(object.shape) > 0: + earray[self.slices] = earray[self.slices] * 2 + 3 + # Read all the array + row = earray.__getitem__(self.slices) + try: + row = earray.__getitem__(self.slices) + except IndexError: + print("IndexError!") + row = np.empty(shape=self.shape, dtype=self.dtype) + + if common.verbose: + print("Object read:\n", repr(row)) + print("Should look like:\n", repr(object)) + if hasattr(object, "shape"): + print("Original object shape:", self.shape) + print("Shape read:", row.shape) + print("shape should look 
as:", object.shape) + + self.assertEqual(initialrows + self.nappends * self.chunksize, + earray.nrows) + self.assertTrue(common.allequal(row, object, self.flavor)) + if not hasattr(row, "shape"): + # Scalar case + self.assertEqual(len(self.shape), 1) + + +class BasicWriteTestCase(BasicTestCase): + type = 'int32' + shape = (0,) + chunksize = 5 + nappends = 10 + step = 1 + # wslice = slice(1,nappends,2) + wslice = 1 # single element case + + +class Basic2WriteTestCase(BasicTestCase): + type = 'int32' + dtype = 'i4' + shape = (0,) + chunksize = 5 + nappends = 10 + step = 1 + wslice = slice(chunksize-2, nappends, 2) # range of elements + reopen = 0 # This case does not reopen files + + +class Basic3WriteTestCase(BasicTestCase): + obj = [1, 2] + type = np.asarray(obj).dtype.name + dtype = np.asarray(obj).dtype.str + shape = (0,) + chunkshape = (5,) + step = 1 + reopen = 0 # This case does not reopen files + + +class Basic4WriteTestCase(BasicTestCase): + obj = np.array([1, 2]) + type = obj.dtype.name + dtype = obj.dtype.str + shape = None + chunkshape = (5,) + step = 1 + reopen = 0 # This case does not reopen files + + +class Basic5WriteTestCase(BasicTestCase): + obj = [1, 2] + type = np.asarray(obj).dtype.name + dtype = np.asarray(obj).dtype.str + shape = (0,) + chunkshape = (5,) + step = 1 + reopen = 1 # This case does reopen files + + +class Basic6WriteTestCase(BasicTestCase): + obj = np.array([1, 2]) + type = obj.dtype.name + dtype = obj.dtype.str + shape = None + chunkshape = (5,) + step = 1 + reopen = 1 # This case does reopen files + + +class Basic7WriteTestCase(BasicTestCase): + obj = [[1, 2], [3, 4]] + type = np.asarray(obj).dtype.name + dtype = np.asarray(obj).dtype.str + shape = (0, 2) + chunkshape = (5,) + step = 1 + reopen = 0 # This case does not reopen files + + +class Basic8WriteTestCase(BasicTestCase): + obj = [[1, 2], [3, 4]] + type = np.asarray(obj).dtype.name + dtype = np.asarray(obj).dtype.str + shape = (0, 2) + chunkshape = (5,) + step = 1 + reopen = 1 # This case does reopen files + + +class EmptyEArrayTestCase(BasicTestCase): + type = 'int32' + dtype = np.dtype('int32') + shape = (2, 0) + chunksize = 5 + nappends = 0 + start = 0 + stop = 10 + step = 1 + + +class NP_EmptyEArrayTestCase(BasicTestCase): + type = 'int32' + dtype = np.dtype('()int32') + shape = (2, 0) + chunksize = 5 + nappends = 0 + + +class Empty2EArrayTestCase(BasicTestCase): + type = 'int32' + dtype = 'int32' + shape = (2, 0) + chunksize = 5 + nappends = 0 + start = 0 + stop = 10 + step = 1 + reopen = 0 # This case does not reopen files + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class SlicesEArrayTestCase(BasicTestCase): + compress = 1 + complib = "lzo" + type = 'int32' + shape = (2, 0) + chunksize = 5 + nappends = 2 + slices = (slice(1, 2, 1), slice(1, 3, 1)) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class Slices2EArrayTestCase(BasicTestCase): + compress = 1 + complib = "blosc" + type = 'int32' + shape = (2, 0, 4) + chunksize = 5 + nappends = 20 + slices = (slice(1, 2, 1), slice(None, None, None), slice(1, 4, 2)) + + +class EllipsisEArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 0) + chunksize = 5 + nappends = 2 + # slices = (slice(1,2,1), Ellipsis) + slices = (Ellipsis, slice(1, 2, 1)) + + +class Ellipsis2EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 0, 4) + chunksize = 5 + nappends = 20 + slices = (slice(1, 2, 1), Ellipsis, slice(1, 4, 2)) + + 
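+# A minimal, self-contained sketch of the slicing behaviour that the
+# Slices*/Ellipsis* cases here parametrise (illustrative only: the node
+# name is arbitrary and this helper is not used by any test).
+def _earray_slicing_sketch(h5file):
+    """Create a small extendable array and read it back with the same
+    kind of multidimensional and Ellipsis selections as the cases above."""
+    ea = h5file.create_earray('/', 'slicing_sketch', atom=tb.Int32Atom(),
+                              shape=(2, 0, 4))
+    # The enlargeable axis is the one of length 0 (axis 1 here).
+    ea.append(np.arange(2 * 3 * 4, dtype='int32').reshape(2, 3, 4))
+    # EArray.__getitem__ accepts NumPy-style tuples of slices and Ellipsis.
+    return ea[1:2, :, 1:4:2], ea[Ellipsis, 1:2]
+
+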
+@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class Slices3EArrayTestCase(BasicTestCase): + compress = 1 # To show the chunks id DEBUG is on + complib = "blosc" + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 20 + slices = (slice(1, 2, 1), slice(0, None, None), + slice(1, 4, 2)) # Don't work + # slices = (slice(None, None, None), slice(0, None, None), + # slice(1,4,1)) # W + # slices = (slice(None, None, None), slice(None, None, None), + # slice(1,4,2)) # N + # slices = (slice(1,2,1), slice(None, None, None), slice(1,4,2)) # N + # Disable the failing test temporarily with a working test case + slices = (slice(1, 2, 1), slice(1, 4, None), slice(1, 4, 2)) # Y + # slices = (slice(1,2,1), slice(0, 4, None), slice(1,4,1)) # Y + slices = (slice(1, 2, 1), slice(0, 4, None), slice(1, 4, 2)) # N + # slices = (slice(1,2,1), slice(0, 4, None), slice(1,4,2), + # slice(0,100,1)) # N + + +class Slices4EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0, 5, 6) + chunksize = 5 + nappends = 20 + slices = (slice(1, 2, 1), slice(0, None, None), slice(1, 4, 2), + slice(0, 4, 2), slice(3, 5, 2), slice(2, 7, 1)) + + +class Ellipsis3EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 20 + slices = (Ellipsis, slice(0, 4, None), slice(1, 4, 2)) + slices = (slice(1, 2, 1), slice(0, 4, None), slice(1, 4, 2), Ellipsis) + + +class Ellipsis4EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 20 + slices = (Ellipsis, slice(0, 4, None), slice(1, 4, 2)) + slices = (slice(1, 2, 1), Ellipsis, slice(1, 4, 2)) + + +class Ellipsis5EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 20 + slices = (slice(1, 2, 1), slice(0, 4, None), Ellipsis) + + +class Ellipsis6EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 2 + # The next slices gives problems with setting values (test05) + # This is a problem on the test design, not the Array.__setitem__ + # code, though. 
+ slices = (slice(1, 2, 1), slice(0, 4, None), 2, Ellipsis) + + +class Ellipsis7EArrayTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 4, 0) + chunksize = 5 + nappends = 2 + slices = (slice(1, 2, 1), slice(0, 4, None), slice(2, 3), Ellipsis) + + +class MD3WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 0, 3) + chunksize = 4 + step = 2 + + +class MD5WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 0, 3, 4, 5) # ok + # shape = (1, 1, 0, 1) # Minimum shape that shows problems with HDF5 1.6.1 + # shape = (2, 3, 0, 4, 5) # Floating point exception (HDF5 1.6.1) + # shape = (2, 3, 3, 0, 5, 6) # Segmentation fault (HDF5 1.6.1) + chunksize = 1 + nappends = 1 + start = 1 + stop = 10 + step = 10 + + +class MD6WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 3, 0, 5, 6) + chunksize = 1 + nappends = 10 + start = 1 + stop = 10 + step = 3 + + +class NP_MD6WriteTestCase(BasicTestCase): + """Testing NumPy scalars as indexes""" + type = 'int32' + shape = (2, 3, 3, 0, 5, 6) + chunksize = 1 + nappends = 10 + + +class MD6WriteTestCase__(BasicTestCase): + type = 'int32' + shape = (2, 0) + chunksize = 1 + nappends = 3 + start = 1 + stop = 3 + step = 1 + + +class MD7WriteTestCase(BasicTestCase): + type = 'int32' + shape = (2, 3, 3, 4, 5, 0, 3) + chunksize = 10 + nappends = 1 + start = 1 + stop = 10 + step = 2 + + +class MD10WriteTestCase(BasicTestCase): + type = 'int32' + shape = (1, 2, 3, 4, 5, 5, 4, 3, 2, 0) + chunksize = 5 + nappends = 10 + start = -1 + stop = -1 + step = 10 + + +class NP_MD10WriteTestCase(BasicTestCase): + type = 'int32' + shape = (1, 2, 3, 4, 5, 5, 4, 3, 2, 0) + chunksize = 5 + nappends = 10 + + +class ZlibComprTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + start = 3 + # stop = 0 # means last row + stop = None # means last row from 0.8 on + step = 10 + + +class ZlibShuffleTestCase(BasicTestCase): + shuffle = 1 + compress = 1 + complib = "zlib" + # case start < stop , i.e. 
no rows read + start = 3 + stop = 1 + step = 10 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscComprTestCase(BasicTestCase): + compress = 1 # sss + complib = "blosc" + chunksize = 10 + nappends = 100 + start = 3 + stop = 10 + step = 3 + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscShuffleTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + complib = "blosc" + chunksize = 100 + nappends = 10 + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class LZOComprTestCase(BasicTestCase): + compress = 1 # sss + complib = "lzo" + chunksize = 10 + nappends = 100 + start = 3 + stop = 10 + step = 3 + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class LZOShuffleTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + complib = "lzo" + chunksize = 100 + nappends = 10 + start = 3 + stop = 10 + step = 7 + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ComprTestCase(BasicTestCase): + compress = 1 + complib = "bzip2" + chunksize = 100 + nappends = 10 + start = 3 + stop = 10 + step = 8 + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ShuffleTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + complib = "bzip2" + chunksize = 100 + nappends = 10 + start = 3 + stop = 10 + step = 6 + + +class Fletcher32TestCase(BasicTestCase): + compress = 0 + fletcher32 = 1 + chunksize = 50 + nappends = 20 + start = 4 + stop = 20 + step = 7 + + +class AllFiltersTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + fletcher32 = 1 + complib = "zlib" + chunksize = 20 # sss + nappends = 50 + start = 2 + stop = 99 + step = 6 +# chunksize = 3 +# nappends = 2 +# start = 1 +# stop = 10 +# step = 2 + + +class FloatTypeTestCase(BasicTestCase): + type = 'float64' + dtype = 'float64' + shape = (2, 0) + chunksize = 5 + nappends = 10 + start = 3 + stop = 10 + step = 20 + + +class ComplexTypeTestCase(BasicTestCase): + type = 'complex128' + dtype = 'complex128' + shape = (2, 0) + chunksize = 5 + nappends = 10 + start = 3 + stop = 10 + step = 20 + + +class StringTestCase(BasicTestCase): + type = "string" + length = 20 + shape = (2, 0) + # shape = (2,0,20) + chunksize = 5 + nappends = 10 + start = 3 + stop = 10 + step = 20 + slices = (slice(0, 1), slice(1, 2)) + + +class String2TestCase(BasicTestCase): + type = "string" + length = 20 + shape = (0,) + # shape = (0, 20) + chunksize = 5 + nappends = 10 + start = 1 + stop = 10 + step = 2 + + +class StringComprTestCase(BasicTestCase): + type = "string" + length = 20 + shape = (20, 0, 10) + # shape = (20,0,10,20) + compr = 1 + # shuffle = 1 # this shouldn't do nothing on chars + chunksize = 50 + nappends = 10 + start = -1 + stop = 100 + step = 20 + + +class SizeOnDiskInMemoryPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + self.array_size = (0, 10) + # set chunkshape so it divides evenly into array_size, to avoid + # partially filled chunks + self.chunkshape = (1000, 10) + # approximate size (in bytes) of non-data portion of hdf5 file + self.hdf_overhead = 6000 + + def create_array(self, complevel): + filters = tb.Filters(complevel=complevel, complib='blosc') + self.array = self.h5file.create_earray('/', 'earray', + atom=tb.Int32Atom(), + shape=self.array_size, + 
filters=filters, + chunkshape=self.chunkshape) + + def test_zero_length(self): + complevel = 0 + self.create_array(complevel) + self.assertEqual(self.array.size_on_disk, 0) + self.assertEqual(self.array.size_in_memory, 0) + + # add 10 chunks of data in one append + def test_no_compression_one_append(self): + complevel = 0 + self.create_array(complevel) + self.array.append([tuple(range(10))] * self.chunkshape[0] * 10) + self.assertEqual(self.array.size_on_disk, 10 * 1000 * 10 * 4) + self.assertEqual(self.array.size_in_memory, 10 * 1000 * 10 * 4) + + # add 10 chunks of data in two appends + def test_no_compression_multiple_appends(self): + complevel = 0 + self.create_array(complevel) + self.array.append([tuple(range(10))] * self.chunkshape[0] * 5) + self.array.append([tuple(range(10))] * self.chunkshape[0] * 5) + self.assertEqual(self.array.size_on_disk, 10 * 1000 * 10 * 4) + self.assertEqual(self.array.size_in_memory, 10 * 1000 * 10 * 4) + + def test_with_compression(self): + complevel = 1 + self.create_array(complevel) + self.array.append([tuple(range(10))] * self.chunkshape[0] * 10) + file_size = Path(self.h5fname).stat().st_size + self.assertTrue( + abs(self.array.size_on_disk - file_size) <= self.hdf_overhead) + self.assertEqual(self.array.size_in_memory, 10 * 1000 * 10 * 4) + self.assertLess(self.array.size_on_disk, self.array.size_in_memory) + + +class OffsetStrideTestCase(common.TempFileMixin, common.PyTablesTestCase): + mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + + def test01a_String(self): + """Checking earray with offseted numpy strings appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_StringAtom..." % self.__class__.__name__) + + earray = self.h5file.create_earray(root, 'strings', + atom=tb.StringAtom(itemsize=3), + shape=(0, 2, 2), + title="Array of strings") + a = np.array([[["a", "b"], ["123", "45"], ["45", "123"]]], dtype="S3") + earray.append(a[:, 1:]) + a = np.array([[["s", "a"], ["ab", "f"], ["s", "abc"], ["abc", "f"]]]) + earray.append(a[:, 2:]) + + # Read all the rows: + row = earray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", earray._v_pathname, ":", earray.nrows) + print("Second row in earray ==>", row[1].tolist()) + + self.assertEqual(earray.nrows, 2) + self.assertEqual(row[0].tolist(), [[b"123", b"45"], [b"45", b"123"]]) + self.assertEqual(row[1].tolist(), [[b"s", b"abc"], [b"abc", b"f"]]) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 2) + + def test01b_String(self): + """Checking earray with strided numpy strings appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_StringAtom..." 
% self.__class__.__name__) + + earray = self.h5file.create_earray(root, 'strings', + atom=tb.StringAtom(itemsize=3), + shape=(0, 2, 2), + title="Array of strings") + a = np.array([[["a", "b"], ["123", "45"], ["45", "123"]]], dtype="S3") + earray.append(a[:, ::2]) + a = np.array([[["s", "a"], ["ab", "f"], ["s", "abc"], ["abc", "f"]]]) + earray.append(a[:, ::2]) + + # Read all the rows: + row = earray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", earray._v_pathname, ":", earray.nrows) + print("Second row in earray ==>", row[1].tolist()) + + self.assertEqual(earray.nrows, 2) + self.assertEqual(row[0].tolist(), [[b"a", b"b"], [b"45", b"123"]]) + self.assertEqual(row[1].tolist(), [[b"s", b"a"], [b"s", b"abc"]]) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 2) + + def test02a_int(self): + """Checking earray with offseted NumPy ints appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02a_int..." % self.__class__.__name__) + + # Create an string atom + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Int32Atom(), shape=(0, 3), + title="array of ints") + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (0, 0, 0)], + dtype='int32') + earray.append(a[2:]) # Create an offset + a = np.array([(1, 1, 1), (-1, 0, 0)], dtype='int32') + earray.append(a[1:]) # Create an offset + + # Read all the rows: + row = earray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", earray._v_pathname, ":", earray.nrows) + print("Third row in vlarray ==>", row[2]) + + self.assertEqual(earray.nrows, 3) + self.assertTrue(common.allequal( + row[0], np.array([1, 1, 1], dtype='int32'))) + self.assertTrue(common.allequal( + row[1], np.array([0, 0, 0], dtype='int32'))) + self.assertTrue(common.allequal( + row[2], np.array([-1, 0, 0], dtype='int32'))) + + def test02b_int(self): + """Checking earray with strided NumPy ints appends.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_int..." % self.__class__.__name__) + + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Int32Atom(), shape=(0, 3), + title="array of ints") + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='int32') + earray.append(a[::3]) # Create an offset + a = np.array([(1, 1, 1), (-1, 0, 0)], dtype='int32') + earray.append(a[::2]) # Create an offset + + # Read all the rows: + row = earray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", earray._v_pathname, ":", earray.nrows) + print("Third row in vlarray ==>", row[2]) + + self.assertEqual(earray.nrows, 3) + self.assertTrue(common.allequal( + row[0], np.array([0, 0, 0], dtype='int32'))) + self.assertTrue(common.allequal( + row[1], np.array([3, 3, 3], dtype='int32'))) + self.assertTrue(common.allequal( + row[2], np.array([1, 1, 1], dtype='int32'))) + + def test03a_int(self): + """Checking earray with byteswapped appends (ints)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03a_int..." 
% self.__class__.__name__) + + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Int32Atom(), shape=(0, 3), + title="array of ints") + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='int32') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Native rows:", native) + print("Byteorder native rows:", native.dtype.byteorder) + print("Swapped rows:", swapped) + print("Byteorder swapped rows:", swapped.dtype.byteorder) + + self.assertTrue(common.allequal(native, swapped)) + + def test03b_float(self): + """Checking earray with byteswapped appends (floats)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_float..." % self.__class__.__name__) + + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Float64Atom(), shape=(0, 3), + title="array of floats") + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='float64') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Native rows:", native) + print("Byteorder native rows:", native.dtype.byteorder) + print("Swapped rows:", swapped) + print("Byteorder swapped rows:", swapped.dtype.byteorder) + + self.assertTrue(common.allequal(native, swapped)) + + def test04a_int(self): + """Checking earray with byteswapped appends (2, ints)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04a_int..." % self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Int32Atom(), shape=(0, 3), + title="array of ints", + byteorder=byteorder) + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='int32') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Byteorder native rows:", + tb.utils.byteorders[native.dtype.byteorder]) + print("Byteorder earray on-disk:", earray.byteorder) + + self.assertEqual(tb.utils.byteorders[native.dtype.byteorder], + sys.byteorder) + self.assertEqual(earray.byteorder, byteorder) + self.assertTrue(common.allequal(native, swapped)) + + def test04b_int(self): + """Checking earray with byteswapped appends (2, ints, reopen)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04b_int..." 
% self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Int32Atom(), shape=(0, 3), + title="array of ints", + byteorder=byteorder) + self._reopen(mode="a") + earray = self.h5file.get_node("/EAtom") + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='int32') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Byteorder native rows:", + tb.utils.byteorders[native.dtype.byteorder]) + print("Byteorder earray on-disk:", earray.byteorder) + + self.assertEqual(tb.utils.byteorders[native.dtype.byteorder], + sys.byteorder) + self.assertEqual(earray.byteorder, byteorder) + self.assertTrue(common.allequal(native, swapped)) + + def test04c_float(self): + """Checking earray with byteswapped appends (2, floats)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04c_float..." % self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Float64Atom(), shape=(0, 3), + title="array of floats", + byteorder=byteorder) + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='float64') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Byteorder native rows:", + tb.utils.byteorders[native.dtype.byteorder]) + print("Byteorder earray on-disk:", earray.byteorder) + + self.assertEqual(tb.utils.byteorders[native.dtype.byteorder], + sys.byteorder) + self.assertEqual(earray.byteorder, byteorder) + self.assertTrue(common.allequal(native, swapped)) + + def test04d_float(self): + """Checking earray with byteswapped appends (2, floats, reopen)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04d_float..." % self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + earray = self.h5file.create_earray(root, 'EAtom', + atom=tb.Float64Atom(), shape=(0, 3), + title="array of floats", + byteorder=byteorder) + self._reopen(mode='a') + earray = self.h5file.get_node("/EAtom") + # Add a native ordered array + a = np.array([(0, 0, 0), (1, 0, 3), (1, 1, 1), (3, 3, 3)], + dtype='float64') + earray.append(a) + # Change the byteorder of the array + a = a.byteswap() + a = a.newbyteorder() + # Add a byteswapped array + earray.append(a) + + # Read all the rows: + native = earray[:4, :] + swapped = earray[4:, :] + if common.verbose: + print("Byteorder native rows:", + tb.utils.byteorders[native.dtype.byteorder]) + print("Byteorder earray on-disk:", earray.byteorder) + + self.assertEqual(tb.utils.byteorders[native.dtype.byteorder], + sys.byteorder) + self.assertEqual(earray.byteorder, byteorder) + self.assertTrue(common.allequal(native, swapped)) + + +class CopyTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_copy(self): + """Checking EArray.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_copy..." 
% self.__class__.__name__) + + # Create an EArray + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.atom.itemsize, array2.atom.itemsize) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + + def test02_copy(self): + """Checking EArray.copy() method (where specified)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_copy..." % self.__class__.__name__) + + # Create an EArray + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + group1 = self.h5file.create_group("/", "group1") + array2 = array1.copy(group1, 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.group1.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + # print("dirs-->", dir(array1), dir(array2)) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.atom.itemsize, array2.atom.itemsize) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + + def test03a_copy(self): + """Checking EArray.copy() method (python flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_copy..." 
% self.__class__.__name__) + + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.flavor = "python" + array1.append(((456, 2), (3, 457))) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertEqual(array1.read(), array2.read()) + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.atom.itemsize, array2.atom.itemsize) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + + def test03b_copy(self): + """Checking EArray.copy() method (python string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03d_copy..." % self.__class__.__name__) + + atom = tb.StringAtom(itemsize=3) + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.flavor = "python" + array1.append([["456", "2"], ["3", "457"]]) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertEqual(array1.read(), array2.read()) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.atom.itemsize, array2.atom.itemsize) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + + def test03c_copy(self): + """Checking EArray.copy() method (String flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03e_copy..." 
% self.__class__.__name__) + + atom = tb.StringAtom(itemsize=4) + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.flavor = "numpy" + array1.append(np.array([["456", "2"], ["3", "457"]], dtype="S4")) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all elements are equal + self.assertTrue(common.allequal(array1.read(), array2.read())) + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.extdim, array2.extdim) + self.assertEqual(array1.flavor, array2.flavor) # Very important here! + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(array1.atom.itemsize, array2.atom.itemsize) + self.assertEqual(array1.title, array2.title) + self.assertEqual(str(array1.atom), str(array2.atom)) + + def test04_copy(self): + """Checking EArray.copy() method (checking title copying)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_copy..." % self.__class__.__name__) + + # Create an EArray + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', title="title array2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + # Assert user attributes + if common.verbose: + print("title of destination array-->", array2.title) + self.assertEqual(array2.title, "title array2") + + def test05_copy(self): + """Checking EArray.copy() method (user attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_copy..." 
% self.__class__.__name__) + + # Create an EArray + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(array2.attrs.attr1, "attr1") + self.assertEqual(array2.attrs.attr2, 2) + + def test05b_copy(self): + """Checking EArray.copy() method (user attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_copy..." % self.__class__.__name__) + + # Create an Array + atom = tb.Int16Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(hasattr(array2.attrs, "attr1"), 0) + self.assertEqual(hasattr(array2.attrs, "attr2"), 0) + + +class CloseCopyTestCase(CopyTestCase): + close = 1 + + +class OpenCopyTestCase(CopyTestCase): + close = 0 + + +class CopyIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrowsinbuf = 2 + + def test01_index(self): + """Checking EArray.copy() method with indexes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_index..." 
% self.__class__.__name__) + + # Create an EArray + atom = tb.Int32Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + r = np.arange(200, dtype='int32') + r.shape = (100, 2) + array1.append(r) + + # Select a different buffer size: + array1.nrowsinbuf = self.nrowsinbuf + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], array2.nrows) + + def test02_indexclosef(self): + """Checking EArray.copy() method with indexes (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_indexclosef..." % self.__class__.__name__) + + # Create an EArray + atom = tb.Int32Atom() + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + r = np.arange(200, dtype='int32') + r.shape = (100, 2) + array1.append(r) + + # Select a different buffer size: + array1.nrowsinbuf = self.nrowsinbuf + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + # Close and reopen the file + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + self.assertTrue(common.allequal(r2, array2.read())) + + # Assert the number of rows in array + if common.verbose: + print("nrows in array2-->", array2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], array2.nrows) + + +class CopyIndex1TestCase(CopyIndexTestCase): + nrowsinbuf = 1 + start = 0 + stop = 7 + step = 1 + + +class CopyIndex2TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 0 + stop = -1 + step = 1 + + +class CopyIndex3TestCase(CopyIndexTestCase): + nrowsinbuf = 3 + start = 1 + stop = 7 + step = 1 + + +class CopyIndex4TestCase(CopyIndexTestCase): + nrowsinbuf = 4 + start = 0 + stop = 6 + step = 1 + + +class CopyIndex5TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 3 + stop = 7 + step = 1 + + +class CopyIndex6TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + start = 3 + stop = 6 + step = 2 + + +class CopyIndex7TestCase(CopyIndexTestCase): + start = 0 + stop = 7 + step = 10 + + +class CopyIndex8TestCase(CopyIndexTestCase): + start = 6 + stop = -1 # Negative values means starting from the end + step = 1 + + +class CopyIndex9TestCase(CopyIndexTestCase): + start = 3 + stop = 4 + step = 1 + + +class CopyIndex10TestCase(CopyIndexTestCase): + nrowsinbuf = 1 + start = 3 + stop = 4 + step = 2 + + +class CopyIndex11TestCase(CopyIndexTestCase): + start = -3 + stop = -1 + step = 2 + + +class CopyIndex12TestCase(CopyIndexTestCase): + start = -1 # Should point to the last element + stop = None # None should mean the last element (including it) + step 
= 1 + + +class TruncateTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + # Create an EArray + atom = tb.Int16Atom(dflt=3) + array1 = self.h5file.create_earray(self.h5file.root, 'array1', + atom=atom, shape=(0, 2), + title="title array1") + # Add a couple of rows + array1.append(np.array([[456, 2], [3, 457]], dtype='int16')) + + def test00_truncate(self): + """Checking EArray.truncate() method (truncating to 0 rows)""" + + array1 = self.h5file.root.array1 + # Truncate to 0 elements + array1.truncate(0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertTrue(common.allequal( + array1[:], np.array([], dtype='int16').reshape(0, 2))) + + def test01_truncate(self): + """Checking EArray.truncate() method (truncating to 1 rows)""" + + array1 = self.h5file.root.array1 + # Truncate to 1 element + array1.truncate(1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertTrue(common.allequal( + array1.read(), np.array([[456, 2]], dtype='int16'))) + + def test02_truncate(self): + """Checking EArray.truncate() method (truncating to == self.nrows)""" + + array1 = self.h5file.root.array1 + # Truncate to 2 elements + array1.truncate(2) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertTrue(common.allequal( + array1.read(), np.array([[456, 2], [3, 457]], dtype='int16'))) + + def test03_truncate(self): + """Checking EArray.truncate() method (truncating to > self.nrows)""" + + array1 = self.h5file.root.array1 + # Truncate to 4 elements + array1.truncate(4) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertEqual(array1.nrows, 4) + # Check the original values + self.assertTrue(common.allequal( + array1[:2], np.array([[456, 2], [3, 457]], dtype='int16'))) + # Check that the added rows have the default values + self.assertTrue(common.allequal( + array1[2:], np.array([[3, 3], [3, 3]], dtype='int16'))) + + +class TruncateOpenTestCase(TruncateTestCase): + close = 0 + + +class TruncateCloseTestCase(TruncateTestCase): + close = 1 + + +# The next test should be run only in **common.heavy** mode +class Rows64bitsTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = 'a' + narows = 1000 * 1000 # each numpy object will have 1 million entries + # narows = 1000 # for testing only + nanumber = 1000 * 3 # That should account for more than 2**31-1 + + def setUp(self): + super().setUp() + + # Create an EArray + array = self.h5file.create_earray( + self.h5file.root, 'array', + atom=tb.Int8Atom(), shape=(0,), + filters=tb.Filters(complib='lzo', complevel=1), + # Specifying expectedrows takes more + # CPU, but less disk + expectedrows=self.narows * self.nanumber) + + # Fill the array + na = np.arange(self.narows, dtype='int8') + for i in range(self.nanumber): + array.append(na) + + def test01_basiccheck(self): + """Some basic checks for earrays exceeding 2**31 rows""" + + array = self.h5file.root.array + + if self.close: + if common.verbose: + # Check how many entries there are in 
the array + print("Before closing") + print("Entries:", array.nrows, type(array.nrows)) + print("Entries:", array.nrows / (1000 * 1000), "Millions") + print("Shape:", array.shape) + + # Close the file + self._reopen() + + array = self.h5file.root.array + if common.verbose: + print("After re-open") + + # Check how many entries there are in the array + if common.verbose: + print("Entries:", array.nrows, type(array.nrows)) + print("Entries:", array.nrows / (1000 * 1000), "Millions") + print("Shape:", array.shape) + print("Last 10 elements-->", array[-10:]) + stop = self.narows % 256 + if stop > 127: + stop -= 256 + start = stop - 10 + print("Should look like-->", np.arange(start, stop, dtype='int8')) + + nrows = self.narows * self.nanumber + # check nrows + self.assertEqual(array.nrows, nrows) + # Check shape + self.assertEqual(array.shape, (nrows,)) + # check the 10 first elements + self.assertTrue(common.allequal( + array[:10], np.arange(10, dtype='int8'))) + # check the 10 last elements + stop = self.narows % 256 + if stop > 127: + stop -= 256 + start = stop - 10 + self.assertTrue(common.allequal( + array[-10:], np.arange(start, stop, dtype='int8'))) + + +class Rows64bitsTestCase1(Rows64bitsTestCase): + close = 0 + + +class Rows64bitsTestCase2(Rows64bitsTestCase): + close = 1 + + +# Test for appending zero-sized arrays +class ZeroSizedTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = 'a' + + def setUp(self): + super().setUp() + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', + atom=tb.Int32Atom(), shape=(3, 0)) + # Append a single row + ea.append([[1], [2], [3]]) + + def test01_canAppend(self): + """Appending zero length array.""" + + fileh = self.h5file + ea = fileh.root.test + arr = np.empty(shape=(3, 0), dtype='int32') + ea.append(arr) + self.assertEqual(ea.nrows, 1, "The number of rows should be 1.") + + def test02_appendWithWrongShape(self): + """Appending zero length array with wrong dimension.""" + + fileh = self.h5file + ea = fileh.root.test + arr = np.empty(shape=(3, 0, 3), dtype='int32') + self.assertRaises(ValueError, ea.append, arr) + + +# Test for dealing with multidimensional atoms +class MDAtomTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01a_append(self): + """Append a row to a (unidimensional) EArray with a MD tables.Atom.""" + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', + atom=tb.Int32Atom((2, 2)), shape=(0,)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append one row + ea.append([[[1, 3], [4, 5]]]) + self.assertEqual(ea.nrows, 1) + if common.verbose: + print("First row-->", ea[0]) + self.assertTrue(common.allequal( + ea[0], np.array([[1, 3], [4, 5]], 'i4'))) + + def test01b_append(self): + """Append several rows to a (unidimensional) EArray with a MD + tables.Atom.""" + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', + atom=tb.Int32Atom((2, 2)), shape=(0,)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append three rows + ea.append([[[1]], [[2]], [[3]]]) # Simple broadcast + self.assertEqual(ea.nrows, 3) + if common.verbose: + print("Third row-->", ea[2]) + self.assertTrue(common.allequal( + ea[2], np.array([[3, 3], [3, 3]], 'i4'))) + + def test02a_append(self): + """Append a row to a (multidimensional) EArray with a + MD tables.Atom.""" + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', + atom=tb.Int32Atom((2,)), shape=(0, 3)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # 
Append one row + ea.append([[[1, 3], [4, 5], [7, 9]]]) + self.assertEqual(ea.nrows, 1) + if common.verbose: + print("First row-->", ea[0]) + self.assertTrue(common.allequal( + ea[0], np.array([[1, 3], [4, 5], [7, 9]], 'i4'))) + + def test02b_append(self): + """Append several rows to a (multidimensional) EArray with a MD + tables.Atom.""" + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', + atom=tb.Int32Atom((2,)), shape=(0, 3)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append three rows + ea.append([[[1, -3], [4, -5], [-7, 9]], + [[-1, 3], [-4, 5], [7, -8]], + [[-2, 3], [-5, 5], [7, -9]]]) + self.assertEqual(ea.nrows, 3) + if common.verbose: + print("Third row-->", ea[2]) + self.assertTrue(common.allequal( + ea[2], np.array([[-2, 3], [-5, 5], [7, -9]], 'i4'))) + + def test03a_MDMDMD(self): + """Complex append of a MD array in a MD EArray with a + MD tables.Atom.""" + + # Create an EArray + ea = self.h5file.create_earray('/', 'test', atom=tb.Int32Atom((2, 4)), + shape=(0, 2, 3)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append three rows + # The shape of the atom should be added at the end of the arrays + a = np.arange(2 * 3*2*4, dtype='i4').reshape((2, 3, 2, 4)) + ea.append([a * 1, a*2, a*3]) + self.assertEqual(ea.nrows, 3) + if common.verbose: + print("Third row-->", ea[2]) + self.assertTrue(common.allequal(ea[2], a * 3)) + + def test03b_MDMDMD(self): + """Complex append of a MD array in a MD EArray with a MD atom (II).""" + # Create an EArray + ea = self.h5file.create_earray('/', 'test', atom=tb.Int32Atom((2, 4)), + shape=(2, 0, 3)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append three rows + # The shape of the atom should be added at the end of the arrays + a = np.arange(2 * 3*2*4, dtype='i4').reshape((2, 1, 3, 2, 4)) + ea.append(a * 1) + ea.append(a * 2) + ea.append(a * 3) + self.assertEqual(ea.nrows, 3) + if common.verbose: + print("Third row-->", ea[:, 2, ...]) + self.assertTrue(common.allequal(ea[:, 2, ...], + a.reshape((2, 3, 2, 4))*3)) + + def test03c_MDMDMD(self): + """Complex append of a MD array in a MD EArray with a MD atom (III).""" + # Create an EArray + ea = self.h5file.create_earray('/', 'test', atom=tb.Int32Atom((2, 4)), + shape=(2, 3, 0)) + if self.reopen: + self._reopen('a') + ea = self.h5file.root.test + # Append three rows + # The shape of the atom should be added at the end of the arrays + a = np.arange(2 * 3*2*4, dtype='i4').reshape((2, 3, 1, 2, 4)) + ea.append(a * 1) + ea.append(a * 2) + ea.append(a * 3) + self.assertEqual(ea.nrows, 3) + if common.verbose: + print("Third row-->", ea[:, :, 2, ...]) + self.assertTrue(common.allequal(ea[:, :, 2, ...], + a.reshape((2, 3, 2, 4))*3)) + + +class MDAtomNoReopen(MDAtomTestCase): + reopen = False + + +class MDAtomReopen(MDAtomTestCase): + reopen = True + + +class AccessClosedTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.array = self.h5file.create_earray(self.h5file.root, 'array', + atom=tb.Int32Atom(), + shape=(0, 10)) + self.array.append(np.zeros((10, 10))) + + def test_read(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.read) + + def test_getitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__getitem__, 0) + + def test_setitem(self): + self.h5file.close() + self.assertRaises(tb.ClosedNodeError, self.array.__setitem__, 0, 0) + + def test_append(self): + self.h5file.close() + 
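+        # append() on a node whose file is already closed must raise
+        # ClosedNodeError (asserted just below).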
self.assertRaises(tb.ClosedNodeError, + self.array.append, np.zeros((10, 10))) + + +class TestCreateEArrayArgs(common.TempFileMixin, common.PyTablesTestCase): + obj = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + where = '/' + name = 'earray' + atom = tb.Atom.from_dtype(obj.dtype) + shape = (0,) + obj.shape[1:] + title = 'title' + filters = None + expectedrows = 1000 + chunkshape = (1, 2) + byteorder = None + createparents = False + + def test_positional_args_01(self): + self.h5file.create_earray(self.where, self.name, + self.atom, self.shape, + self.title, self.filters, + self.expectedrows, self.chunkshape) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + + def test_positional_args_02(self): + ptarr = self.h5file.create_earray(self.where, self.name, + self.atom, self.shape, + self.title, + self.filters, + self.expectedrows, + self.chunkshape) + ptarr.append(self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_positional_args_obj(self): + self.h5file.create_earray(self.where, self.name, + None, None, + self.title, + self.filters, + self.expectedrows, + self.chunkshape, + self.byteorder, + self.createparents, + self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj(self): + self.h5file.create_earray(self.where, self.name, title=self.title, + chunkshape=self.chunkshape, + obj=self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_shape_01(self): + ptarr = self.h5file.create_earray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + atom=self.atom, shape=self.shape) + ptarr.append(self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + 
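+    # Illustrative sketch (not part of the original test suite): the keyword
+    # combinations exercised in this class rely on ``create_earray()`` taking
+    # either an explicit ``atom``/``shape`` pair or a ready-made ``obj`` array
+    # from which both are inferred.  Assuming a writable file handle ``h5f``
+    # and hypothetical node names ``demo1``/``demo2``, the two spellings below
+    # should produce equivalent extendable arrays:
+    #
+    #     import numpy as np
+    #     import tables as tb
+    #
+    #     with tb.open_file('demo.h5', 'w') as h5f:
+    #         data = np.arange(6, dtype='int64').reshape(2, 3)
+    #         # explicit atom/shape: start empty, then append along axis 0
+    #         ea1 = h5f.create_earray('/', 'demo1',
+    #                                 atom=tb.Atom.from_dtype(data.dtype),
+    #                                 shape=(0,) + data.shape[1:])
+    #         ea1.append(data)
+    #         # obj: atom, shape and initial content come from the array
+    #         ea2 = h5f.create_earray('/', 'demo2', obj=data)
+    #         assert ea1.nrows == ea2.nrows == data.shape[0]
+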
def test_kwargs_atom_shape_02(self): + ptarr = self.h5file.create_earray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + atom=self.atom, shape=self.shape) + # ptarr.append(self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.shape) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + + def test_kwargs_obj_atom(self): + ptarr = self.h5file.create_earray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + atom=self.atom) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_shape(self): + ptarr = self.h5file.create_earray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + shape=self.shape) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_shape(self): + ptarr = self.h5file.create_earray(self.where, self.name, + title=self.title, + chunkshape=self.chunkshape, + obj=self.obj, + atom=self.atom, + shape=self.shape) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, self.obj.shape) + self.assertEqual(ptarr.nrows, self.obj.shape[0]) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertEqual(ptarr.chunkshape, self.chunkshape) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_error(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_earray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom) + + def test_kwargs_obj_shape_error(self): + # atom = tables.Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_earray, + self.where, + self.name, + title=self.title, + obj=self.obj, + shape=shape) + + def test_kwargs_obj_atom_shape_error_01(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_earray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=self.shape) + + def test_kwargs_obj_atom_shape_error_02(self): + # atom = tables.Atom.from_dtype(numpy.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_earray, + self.where, + self.name, + title=self.title, + obj=self.obj, 
+ atom=self.atom, + shape=shape) + + def test_kwargs_obj_atom_shape_error_03(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_earray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom, + shape=shape) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + # theSuite.addTest(unittest.makeSuite(BasicWriteTestCase)) + # theSuite.addTest(unittest.makeSuite(Rows64bitsTestCase1)) + # theSuite.addTest(unittest.makeSuite(Rows64bitsTestCase2)) + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic2WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic3WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic4WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic5WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic6WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic7WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic8WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(EmptyEArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Empty2EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(SlicesEArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Slices2EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(EllipsisEArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis2EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis3EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ShuffleTestCase)) + theSuite.addTest(common.unittest.makeSuite(FloatTypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(ComplexTypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(StringTestCase)) + theSuite.addTest(common.unittest.makeSuite(String2TestCase)) + theSuite.addTest(common.unittest.makeSuite(StringComprTestCase)) + theSuite.addTest(common.unittest.makeSuite( + SizeOnDiskInMemoryPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OffsetStrideTestCase)) + theSuite.addTest(common.unittest.makeSuite(Fletcher32TestCase)) + theSuite.addTest(common.unittest.makeSuite(AllFiltersTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex1TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex2TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex3TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex4TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex5TestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateCloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZeroSizedTestCase)) + theSuite.addTest(common.unittest.makeSuite(MDAtomNoReopen)) + 
theSuite.addTest(common.unittest.makeSuite(MDAtomReopen)) + theSuite.addTest(common.unittest.makeSuite(AccessClosedTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestCreateEArrayArgs)) + if common.heavy: + theSuite.addTest(common.unittest.makeSuite(Slices3EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Slices4EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis4EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis5EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis6EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(Ellipsis7EArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD3WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD5WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD6WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD7WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(MD10WriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex6TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex7TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex8TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex9TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex10TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex11TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex12TestCase)) + theSuite.addTest(common.unittest.makeSuite(Rows64bitsTestCase1)) + theSuite.addTest(common.unittest.makeSuite(Rows64bitsTestCase2)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_enum.py b/tables/tests/test_enum.py new file mode 100644 index 0000000..dfdd749 --- /dev/null +++ b/tables/tests/test_enum.py @@ -0,0 +1,655 @@ +"""Test module for enumerated types under PyTables.""" + +import itertools +import operator + +import tables as tb +from tables.tests import common + + +class CreateColTestCase(common.PyTablesTestCase): + """Test creating enumerated column descriptions.""" + + def _createCol(self, enum, dflt, base='uint32', shape=()): + """Create and check an enumerated column description.""" + + enumcol = tb.EnumCol(enum, dflt, base=base, shape=shape) + sameEnum = tb.Enum(enum) + self.assertEqual(enumcol.type, 'enum') + self.assertEqual(enumcol.dtype.base.name, enumcol.base.type) + # To avoid 'LongInt' vs 'Int' issues + # self.assertEqual(enumcol.dflt, sameEnum[dflt]) + self.assertEqual(int(enumcol.dflt), int(sameEnum[dflt])) + self.assertEqual(enumcol.dtype.shape, shape) + self.assertEqual(enumcol.enum, sameEnum) + + def test00a_validFromEnum(self): + """Describing an enumerated column from an enumeration.""" + + colors = tb.Enum(['red', 'green', 'blue']) + self._createCol(colors, 'red') + + def test00b_validFromDict(self): + """Describing an enumerated column from a dictionary.""" + + colors = {'red': 4, 'green': 2, 'blue': 1} + self._createCol(colors, 'red') + + def test00c_validFromList(self): + """Describing an enumerated column from a list.""" + + colors = ['red', 'green', 'blue'] + self._createCol(colors, 'red') + + def test00d_invalidFromType(self): + """Describing an enumerated column from an invalid object.""" + + colors = 123 + self.assertRaises(TypeError, self._createCol, colors, 'red') + + def test01_invalidDflt(self): + """Describing an enumerated column with an invalid default object.""" + + colors = {'red': 4, 'green': 2, 'blue': 1} + 
self.assertRaises(KeyError, self._createCol, colors, 'black') + + def test02a_validDtypeBroader(self): + """Describing an enumerated column with a broader type.""" + + colors = {'red': 4, 'green': 2, 'blue': 1} + self._createCol(colors, 'red', 'int64') + + def test02b_invalidDtypeTooNarrow(self): + """Describing an enumerated column with a too narrow type.""" + + colors = ['e%d' % i for i in range(300)] + self.assertRaises(TypeError, self._createCol, colors, 'e0', 'uint8') + + def test03a_validShapeMD(self): + """Describing an enumerated column with multidimensional shape.""" + + colors = ['red', 'green', 'blue'] + self._createCol(colors, 'red', shape=(2,)) + + def test04a_validReprEnum(self): + """Checking the string representation of an enumeration.""" + + colors = tb.Enum(['red', 'green', 'blue']) + enumcol = tb.EnumCol(colors, 'red', base='uint32', shape=()) + + # needed due to "Hash randomization" (default on python 3.3) + template = ( + "EnumCol(enum=Enum({%s}), dflt='red', base=UInt32Atom(shape=(), " + "dflt=0), shape=(), pos=None)" + ) + permitations = [ + template % ', '.join(items) for items in itertools.permutations( + ("'blue': 2", "'green': 1", "'red': 0")) + ] + self.assertIn(repr(enumcol), permitations) + + def test99a_nonIntEnum(self): + """Describing an enumerated column of floats (not implemented).""" + + colors = {'red': 1.0} + self.assertRaises(NotImplementedError, self._createCol, colors, 'red', + base=tb.FloatAtom()) + + def test99b_nonIntDtype(self): + """Describing an enumerated column encoded as floats. + + (not implemented). + + """ + + colors = ['red', 'green', 'blue'] + self.assertRaises( + NotImplementedError, self._createCol, colors, 'red', 'float64') + + def test99b_nonScalarEnum(self): + """Describing an enumerated column of non-scalars (not implemented).""" + + colors = {'red': (1, 2, 3)} + self.assertRaises(NotImplementedError, self._createCol, colors, 'red', + base=tb.IntAtom(shape=3)) + + +class CreateAtomTestCase(common.PyTablesTestCase): + """Test creating enumerated atoms.""" + + def _createAtom(self, enum, dflt, base='uint32', shape=()): + """Create and check an enumerated atom.""" + + enumatom = tb.EnumAtom(enum, dflt, base=base, shape=shape) + sameEnum = tb.Enum(enum) + self.assertEqual(enumatom.type, 'enum') + self.assertEqual(enumatom.dtype.base.name, enumatom.base.type) + self.assertEqual(enumatom.shape, shape) + self.assertEqual(enumatom.enum, sameEnum) + + def test00a_validFromEnum(self): + """Describing an enumerated atom from an enumeration.""" + + colors = tb.Enum(['red', 'green', 'blue']) + self._createAtom(colors, 'red') + + def test00b_validFromDict(self): + """Describing an enumerated atom from a dictionary.""" + + colors = {'red': 4, 'green': 2, 'blue': 1} + self._createAtom(colors, 'red') + + def test00c_validFromList(self): + """Describing an enumerated atom from a list.""" + + colors = ['red', 'green', 'blue'] + self._createAtom(colors, 'red') + + def test00d_invalidFromType(self): + """Describing an enumerated atom from an invalid object.""" + + colors = 123 + self.assertRaises(TypeError, self._createAtom, colors, 'red') + + def test02a_validDtypeBroader(self): + """Describing an enumerated atom with a broader type.""" + + colors = {'red': 4, 'green': 2, 'blue': 1} + self._createAtom(colors, 'red', base='int64') + + def test02b_invalidDtypeTooNarrow(self): + """Describing an enumerated atom with a too narrow type.""" + + colors = ['e%d' % i for i in range(300)] + self.assertRaises(TypeError, self._createAtom, colors, 'red', 
'uint8') + + def test03a_validShapeMD(self): + """Describing an enumerated atom with multidimensional shape.""" + + colors = ['red', 'green', 'blue'] + self._createAtom(colors, 'red', shape=(2,)) + + def test99a_nonIntEnum(self): + """Describing an enumerated atom of floats (not implemented).""" + + colors = {'red': 1.0} + self.assertRaises(NotImplementedError, self._createAtom, colors, 'red', + base=tb.FloatAtom()) + + def test99b_nonIntDtype(self): + """Describing an enumerated atom encoded as a float. + + (not implemented). + + """ + + colors = ['red', 'green', 'blue'] + self.assertRaises( + NotImplementedError, self._createAtom, colors, 'red', 'float64') + + def test99b_nonScalarEnum(self): + """Describing an enumerated atom of non-scalars (not implemented).""" + + colors = {'red': (1, 2, 3)} + self.assertRaises(NotImplementedError, self._createAtom, colors, 'red', + base=tb.IntAtom(shape=3)) + + +class EnumTableTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test tables with enumerated columns.""" + + enum = tb.Enum({'red': 4, 'green': 2, 'blue': 1, 'black': 0}) + defaultName = 'black' + valueInEnum = enum.red + valueOutOfEnum = 1234 + enumType = 'uint16' + + def _description(self, shape=()): + class TestDescription(tb.IsDescription): + rid = tb.IntCol(pos=0) + rcolor = tb.EnumCol( + self.enum, self.defaultName, + base=self.enumType, shape=shape, pos=1) + + return TestDescription + + def test00a_reopen(self): + """Reopening a file with tables using enumerated data.""" + + self.h5file.create_table( + '/', 'test', self._description(), title=self._getMethodName()) + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum('rcolor'), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test00b_reopenMD(self): + """Reopening a file with tables using enumerated multi-dimensional + data.""" + + self.h5file.create_table( + '/', 'test', self._description((2,)), title=self._getMethodName()) + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum('rcolor'), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test01_rowAppend(self): + """Appending enumerated values using ``row.append()``.""" + + tbl = self.h5file.create_table( + '/', 'test', self._description(), title=self._getMethodName()) + + appended = [ + (10, self.valueInEnum), + (20, self.valueOutOfEnum)] + + row = tbl.row + + row['rid'] = appended[0][0] + row['rcolor'] = appended[0][1] + row.append() + + row['rid'] = appended[1][0] + self.assertRaises( + ValueError, operator.setitem, row, 'rcolor', appended[1][1]) + + tbl.flush() + tbl.flavor = 'python' + read = tbl.read() + common.verbosePrint( + "* appended value: %s\n" + "* read value: %s\n" + % (appended[:-1], read)) + self.assertEqual( + appended[:-1], read, "Written and read values differ.") + + def test02_append(self): + """Appending enumerated values using ``table.append()``.""" + + tbl = self.h5file.create_table( + '/', 'test', self._description(), title=self._getMethodName()) + + appended = [ + (10, self.valueInEnum), + (20, self.valueOutOfEnum)] + + tbl.append(appended) + tbl.flush() + tbl.flavor = 'python' + read = tbl.read() + common.verbosePrint( + "* appended value: %s\n" + "* read value: %s\n" + % (appended, read)) + self.assertEqual(appended, read, "Written and read values differ.") + + def test03_setitem(self): + """Changing enumerated values using ``table.__setitem__()``.""" + + tbl = self.h5file.create_table( + '/', 'test', self._description(), title=self._getMethodName()) 
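+        # For reference (hypothetical standalone usage, not executed here):
+        # an Enum maps names to concrete integers and an EnumCol stores only
+        # those integers, so whole-table writes like the one below accept any
+        # value of the base type, even ``valueOutOfEnum`` (1234), whereas
+        # assigning through ``Row`` validates membership and raises ValueError
+        # (see test01_rowAppend above):
+        #
+        #     colors = tb.Enum({'red': 4, 'green': 2, 'blue': 1, 'black': 0})
+        #     colors['red']   # -> 4
+        #     colors(4)       # -> 'red' (reverse lookup)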
+ + appended = [ + (10, self.valueInEnum), + (20, self.valueInEnum)] + tbl.append(appended) + + written = [ + (10, self.valueInEnum), + (20, self.valueOutOfEnum)] + tbl[:] = written + tbl.flavor = 'python' + read = tbl.read() + common.verbosePrint( + "* written value: %s\n" + "* read value: %s\n" + % (written, read)) + self.assertEqual(written, read, "Written and read values differ.") + + def test04_multidim(self): + """Appending multi-dimensional enumerated data.""" + + tbl = self.h5file.create_table( + '/', 'test', self._description((2,)), title=self._getMethodName()) + + appended = [ + (10, (self.valueInEnum, self.valueOutOfEnum)), + (20, (self.valueInEnum, self.valueOutOfEnum))] + + row = tbl.row + row['rid'] = appended[0][0] + self.assertRaises( + ValueError, operator.setitem, row, 'rcolor', appended[0][1]) + + tbl.append(appended) + tbl.flush() + tbl.flavor = 'python' + read = tbl.read() + for x_appended, x_read in zip(appended, read): + self.assertEqual(x_appended[0], x_read[0], + "Written and read values differ.") + self.assertEqual(x_appended[1][0], x_read[1][0], + "Written and read values differ.") + self.assertEqual(x_appended[1][1], x_read[1][1], + "Written and read values differ.") + + def test05_where(self): + """Searching enumerated data.""" + + tbl = self.h5file.create_table( + '/', 'test', self._description(), title=self._getMethodName()) + + appended = [ + (10, self.valueInEnum), + (20, self.valueInEnum), + (30, self.valueOutOfEnum)] + tbl.append(appended) + tbl.flush() + + searched = [ + (row['rid'], row['rcolor']) + for row in tbl.where('rcolor == v', {'v': self.valueInEnum})] + common.verbosePrint( + "* ``valueInEnum``: %s\n" + "* ``rcolor`` column: ``%s``\n" + "* ``searched``: %s\n" + "* Should look like: %s\n" + % (self.valueInEnum, tbl.cols.rcolor, searched, appended[:-1])) + self.assertEqual( + searched, appended[:-1], "Search returned incorrect results.") + + +class EnumEArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test extendable arrays of enumerated values.""" + + enum = tb.Enum({'red': 4, 'green': 2, 'blue': 1, 'black': 0}) + valueInEnum = enum.red + valueOutOfEnum = 1234 + enumType = 'uint16' + + def _atom(self, shape=()): + return tb.EnumAtom(self.enum, 'red', base=self.enumType, shape=shape) + + def test00a_reopen(self): + """Reopening a file with extendable arrays using enumerated data.""" + + self.h5file.create_earray( + '/', 'test', self._atom(), shape=(0,), + title=self._getMethodName()) + self.h5file.root.test.flavor = 'python' + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test00b_reopenMD(self): + """Reopening a file with extendable arrays using enumerated + multi-dimensional data.""" + + self.h5file.create_earray( + '/', 'test', self._atom(), shape=(0, 2), + title=self._getMethodName()) + self.h5file.root.test.flavor = 'python' + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test_enum_default_persistence_red(self): + dflt = 'red' + atom = tb.EnumAtom(self.enum, dflt, base=self.enumType, shape=()) + + self.h5file.create_earray('/', 'test', atom, shape=(0,), + title=self._getMethodName()) + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + self.assertEqual( + self.h5file.root.test.atom.dflt, self.enum[dflt], + "The default 
value of enumerated type was not restored correctly " + "from disk.") + + def test_enum_default_persistence_green(self): + dflt = 'green' + atom = tb.EnumAtom(self.enum, dflt, base=self.enumType, shape=()) + + self.h5file.create_earray('/', 'test', atom, shape=(0,), + title=self._getMethodName()) + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + self.assertEqual( + self.h5file.root.test.atom.dflt, self.enum[dflt], + "The default value of enumerated type was not restored correctly " + "from disk.") + + def test_enum_default_persistence_blue(self): + dflt = 'blue' + atom = tb.EnumAtom(self.enum, dflt, base=self.enumType, shape=()) + + self.h5file.create_earray('/', 'test', atom, shape=(0,), + title=self._getMethodName()) + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + self.assertEqual( + self.h5file.root.test.atom.dflt, self.enum[dflt], + "The default value of enumerated type was not restored correctly " + "from disk.") + + def test_enum_default_persistence_black(self): + dflt = 'black' + atom = tb.EnumAtom(self.enum, dflt, base=self.enumType, shape=()) + + self.h5file.create_earray('/', 'test', atom, shape=(0,), + title=self._getMethodName()) + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + self.assertEqual( + self.h5file.root.test.atom.dflt, self.enum[dflt], + "The default value of enumerated type was not restored correctly " + "from disk.") + + def test01_append(self): + """Appending scalar elements of enumerated values.""" + + earr = self.h5file.create_earray( + '/', 'test', self._atom(), shape=(0,), + title=self._getMethodName()) + earr.flavor = 'python' + + appended = [self.valueInEnum, self.valueOutOfEnum] + + earr.append(appended) + earr.flush() + read = earr.read() + self.assertEqual(appended, read, "Written and read values differ.") + + def test02_appendMD(self): + """Appending multi-dimensional elements of enumerated values.""" + + earr = self.h5file.create_earray( + '/', 'test', self._atom(), shape=(0, 2), + title=self._getMethodName()) + earr.flavor = 'python' + + appended = [ + [self.valueInEnum, self.valueOutOfEnum], + [self.valueInEnum, self.valueOutOfEnum]] + + earr.append(appended) + earr.flush() + read = earr.read() + self.assertEqual(appended, read, "Written and read values differ.") + + def test03_setitem(self): + """Changing enumerated values using ``earray.__setitem__()``.""" + + earr = self.h5file.create_earray( + '/', 'test', self._atom(), shape=(0,), + title=self._getMethodName()) + earr.flavor = 'python' + + appended = (self.valueInEnum, self.valueInEnum) + earr.append(appended) + + written = [self.valueInEnum, self.valueOutOfEnum] + earr[:] = written + read = earr.read() + self.assertEqual(written, read, "Written and read values differ.") + + +class EnumVLArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test variable-length arrays of enumerated values.""" + + enum = tb.Enum({'red': 4, 'green': 2, 'blue': 1, 'black': 0}) + valueInEnum = enum.red + valueOutOfEnum = 1234 + enumType = 'uint16' + + def _atom(self, shape=()): + return tb.EnumAtom(self.enum, 'red', base=self.enumType, shape=shape) + + def test00a_reopen(self): + """Reopening a file with variable-length arrays using + enumerated data.""" + + self.h5file.create_vlarray( + '/', 'test', 
self._atom(), + title=self._getMethodName()) + self.h5file.root.test.flavor = 'python' + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test00b_reopenMD(self): + """Reopening a file with variable-length arrays using enumerated + multi-dimensional data.""" + + self.h5file.create_vlarray( + '/', 'test', self._atom((2,)), + title=self._getMethodName()) + self.h5file.root.test.flavor = 'python' + + self._reopen() + + self.assertEqual( + self.h5file.root.test.get_enum(), self.enum, + "Enumerated type was not restored correctly from disk.") + + def test01_append(self): + """Appending scalar elements of enumerated values.""" + + vlarr = self.h5file.create_vlarray( + '/', 'test', self._atom(), + title=self._getMethodName()) + vlarr.flavor = 'python' + + appended = [ + [self.valueInEnum, ], + [self.valueInEnum, self.valueOutOfEnum]] + + vlarr.append(appended[0]) + vlarr.append(appended[1]) + vlarr.flush() + read = vlarr.read() + common.verbosePrint( + "* appended value: %s\n" + "* read value: %s\n" + % (appended, read)) + self.assertEqual(appended, read, "Written and read values differ.") + + def test02_appendMD(self): + """Appending multi-dimensional elements of enumerated values.""" + + vlarr = self.h5file.create_vlarray( + '/', 'test', self._atom((2,)), + title=self._getMethodName()) + vlarr.flavor = 'python' + + appended = [ + [[self.valueInEnum, self.valueInEnum], ], + [[self.valueInEnum, self.valueOutOfEnum], + [self.valueInEnum, self.valueInEnum]]] + + vlarr.append(appended[0]) + vlarr.append(appended[1]) + vlarr.flush() + read = vlarr.read() + common.verbosePrint( + "* appended value: %s\n" + "* read value: %s\n" + % (appended, read)) + self.assertEqual(appended, read, "Written and read values differ.") + + def test03_setitem(self): + """Changing enumerated values using ``vlarray.__setitem__()``.""" + + vlarr = self.h5file.create_vlarray( + '/', 'test', self._atom(), + title=self._getMethodName()) + vlarr.flavor = 'python' + + appended = (self.valueInEnum, self.valueInEnum) + vlarr.append(appended) + + written = [self.valueInEnum, self.valueOutOfEnum] + vlarr[0] = written + read = vlarr.read() + common.verbosePrint( + "* written value: %s\n" + "* read value: %s\n" + % (written, read)) + self.assertEqual(written, read[0], "Written and read values differ.") + + +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + # These two are for including Enum's doctests here. 
+ import doctest + theSuite = common.unittest.TestSuite() + niter = 1 + + # theSuite.addTest(unittest.makeSuite(EnumTableTestCase)) + for i in range(niter): + theSuite.addTest(doctest.DocTestSuite(tb.misc.enum)) + theSuite.addTest(common.unittest.makeSuite(CreateColTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateAtomTestCase)) + theSuite.addTest(common.unittest.makeSuite(EnumTableTestCase)) + theSuite.addTest(common.unittest.makeSuite(EnumEArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(EnumVLArrayTestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_expression.py b/tables/tests/test_expression.py new file mode 100644 index 0000000..502ae42 --- /dev/null +++ b/tables/tests/test_expression.py @@ -0,0 +1,1561 @@ +"""Test module for evaluating expressions under PyTables.""" + + +import numpy as np + +import tables as tb +from tables.tests import common + +# An example of record + + +class Record(tb.IsDescription): + colInt32 = tb.Int32Col() + colInt64 = tb.Int64Col() + colFloat32 = tb.Float32Col() + colFloat64 = tb.Float64Col() + colComplex = tb.ComplexCol(itemsize=16) + + +# Helper functions +def get_sliced_vars(npvars, start, stop, step): + npvars_ = {} + for name, var in npvars.items(): + if hasattr(var, "__len__"): + npvars_[name] = var[start:stop:step] + else: + npvars_[name] = var + return npvars_ + + +def get_sliced_vars2(npvars, start, stop, step, shape, maindim): + npvars_ = {} + slices = [slice(None) for dim in shape] + slices[maindim] = slice(start, stop, step) + for name, var in npvars.items(): + npvars_[name] = var.__getitem__(tuple(slices)) + return npvars_ + + +# Basic tests +class ExprTestCase(common.TempFileMixin, common.PyTablesTestCase): + + # The shape for the variables in expressions + shape = (10, 20) + + def setUp(self): + super().setUp() + + # The expression + self.expr = "2 * a*b + c" + # Define the NumPy variables to be used in expression + N = np.prod(self.shape) + self.a = a = np.arange(0, N, dtype='int32').reshape(self.shape) + self.b = b = np.arange(N, 2 * N, dtype='int64').reshape(self.shape) + self.c = c = np.arange(2 * N, 3*N, dtype='int32').reshape(self.shape) + self.r1 = r1 = np.empty(N, dtype='int64').reshape(self.shape) + self.npvars = {"a": a, "b": b, "c": c, } + # Define other variables, if needed + root = self.h5file.root + if self.kind == "Array": + self.a = self.h5file.create_array(root, "a", a) + self.b = self.h5file.create_array(root, "b", b) + self.c = self.h5file.create_array(root, "c", c) + self.r1 = self.h5file.create_array(root, "r1", r1) + elif self.kind == "CArray": + self.a = self.h5file.create_carray( + root, "a", atom=tb.Atom.from_dtype(a.dtype), + shape=self.shape) + self.b = self.h5file.create_carray( + root, "b", atom=tb.Atom.from_dtype(b.dtype), + shape=self.shape) + self.c = self.h5file.create_carray( + root, "c", atom=tb.Atom.from_dtype(c.dtype), + shape=self.shape) + self.r1 = self.h5file.create_carray( + root, "r1", atom=tb.Atom.from_dtype(r1.dtype), + shape=self.shape) + self.a[:] = a + self.b[:] = b + self.c[:] = c + elif self.kind == "EArray": + shape = list(self.shape) + shape[0] = 0 + self.a = self.h5file.create_earray( + root, "a", atom=tb.Atom.from_dtype(a.dtype), shape=shape) + self.b = self.h5file.create_earray( + root, "b", atom=tb.Atom.from_dtype(b.dtype), shape=shape) + self.c = self.h5file.create_earray( + root, "c", 
atom=tb.Atom.from_dtype(c.dtype), shape=shape) + self.r1 = self.h5file.create_earray( + root, "r1", atom=tb.Atom.from_dtype(r1.dtype), shape=shape) + self.a.append(a) + self.b.append(b) + self.c.append(c) + self.r1.append(r1) # Fill with uninitialized values + elif self.kind == "Column": + ra = np.rec.fromarrays( + [a, b, c, r1], + dtype="%si4,%si8,%si4,%si8" % ((self.shape[1:],)*4)) + t = self.h5file.create_table(root, "t", ra) + self.a = t.cols.f0 + self.b = t.cols.f1 + self.c = t.cols.f2 + self.d = t.cols.f3 + self.vars = {"a": self.a, "b": self.b, "c": self.c, } + + def test00_simple(self): + """Checking that expression is correctly evaluated.""" + + expr = tb.Expr(self.expr, self.vars) + r1 = expr.eval() + r2 = eval(self.expr, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1)) + print("Should look like:", repr(r2)) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01_out(self): + """Checking that expression is correctly evaluated (`out` param)""" + + expr = tb.Expr(self.expr, self.vars) + expr.set_output(self.r1) + r1 = expr.eval() + if self.kind != "NumPy": + r1 = r1[:] + r2 = eval(self.expr, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1)) + print("Should look like:", repr(r2)) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test02_out(self): + """Checking that expression is correctly evaluated when slice is + outside of data samples (`out` param)""" + expr = tb.Expr(self.expr, self.vars) + # maybe it's better to use the leading dimemsion instead? + maxshape = max(self.shape) + start, stop, step = (maxshape + 1, maxshape + 2, None) + expr.set_inputs_range(start, stop, step) + r1 = expr.eval() + # create an empty array with the same dtype and shape + zeros = np.zeros(shape=self.shape, dtype=r1.dtype) + r2 = zeros[start:stop:step] + self.assertListEqual(r1.tolist(), r2.tolist()) + if common.verbose: + print("Computed expression:", repr(r1)) + print("Should look like:", repr(r2)) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +class ExprNumPy(ExprTestCase): + kind = "NumPy" + + +class ExprArray(ExprTestCase): + kind = "Array" + + +class ExprCArray(ExprTestCase): + kind = "CArray" + + +class ExprEArray(ExprTestCase): + kind = "EArray" + + +class ExprColumn(ExprTestCase): + kind = "Column" + + +# Test for mixed containers +class MixedContainersTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + # The expression + self.expr = "2 * a*b + c**2+d**2+e-f+g" + + # Create a directory in file for outputs + root = self.h5file.root + outs = self.h5file.create_group(root, "outs") + + # Define the NumPy variables to be used in expression + N = np.prod(self.shape) + + # Initial values for variables + a = np.arange(0, N, dtype='int32').reshape(self.shape) + b = np.arange(N, 2 * N, dtype='int64').reshape(self.shape) + c = np.arange(2 * N, 3*N, dtype='int32').reshape(self.shape) + d = np.arange(3 * N, 4*N, dtype='int32').reshape(self.shape) + e = np.arange(4 * N, 5*N, dtype='int32').reshape(self.shape) + self.f = f = int(3) # a regular python type + self.g = g = np.int16(2) # a NumPy scalar type + + # Original values + self.npvars = {"a": a, "b": b, "c": c, "d": d, "e": e, "f": f, "g": g} + rnda = b.copy() + + # ndarray input and output + self.a = a + self.rnda = rnda + + # Array input and output + self.b = self.h5file.create_array(root, "b", b) + 
self.rarr = self.b.copy(outs) + + # CArray input and output + self.c = self.h5file.create_carray( + root, "c", atom=tb.Atom.from_dtype(c.dtype), shape=self.shape) + self.c[:] = c + self.rcarr = self.c.copy(outs) + + # EArray input and output + eshape = list(self.shape) + eshape[0] = 0 + self.d = self.h5file.create_earray( + root, "d", atom=tb.Atom.from_dtype(d.dtype), shape=eshape) + self.d.append(d) + self.rearr = self.d.copy(outs) + + # Column input and output + rtype = {} + colshape = self.shape[1:] + for i, col in enumerate((a, b, c, d, e, rnda)): + rtype['f%d' % i] = tb.Col.from_sctype(col.dtype.type, colshape) + t = self.h5file.create_table(root, "t", rtype) + nrows = self.shape[0] + row = t.row + for nrow in range(nrows): + for i, col in enumerate((a, b, c, d, e, rnda)): + row['f%d' % i] = col[nrow] + row.append() + t.flush() + self.e = t.cols.f4 + self.rcol = t.cols.f5 + # Input vars + self.vars = {"a": self.a, "b": self.b, "c": self.c, "d": self.d, + "e": self.e, "f": self.f, "g": self.g, } + + def test00a_simple(self): + """Checking expressions with mixed objects.""" + + expr = tb.Expr(self.expr, self.vars) + r1 = expr.eval() + r2 = eval(self.expr, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test00b_simple_scalars(self): + """Checking that scalars in expression evaluate correctly.""" + + expr_str = "2 * f + g" + expr = tb.Expr(expr_str, self.vars) + r1 = expr.eval() + r2 = eval(expr_str, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue( + r1.shape == r2.shape and r1.dtype == r2.dtype and r1 == r2, + "Evaluate is returning a wrong value.") + + def test01a_out(self): + """Checking expressions with mixed objects (`out` param)""" + + expr = tb.Expr(self.expr, self.vars) + for r1 in self.rnda, self.rarr, self.rcarr, self.rearr, self.rcol: + if common.verbose: + print("Checking output container:", type(r1)) + expr.set_output(r1) + r1 = expr.eval() + if not isinstance(r1, type(self.rnda)): + r1 = r1[:] + r2 = eval(self.expr, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01b_out_scalars(self): + """Checking expressions with mixed objects (`out` param, scalars)""" + + if len(self.shape) > 1: + # This test is only meant for undimensional outputs + return + expr_str = "2 * f + g" + expr = tb.Expr(expr_str, self.vars) + for r1 in self.rnda, self.rarr, self.rcarr, self.rearr, self.rcol: + if common.verbose: + print("Checking output container:", type(r1)) + expr.set_output(r1) + r1 = expr.eval() + r1 = r1[()] # convert a 0-dim array into a scalar + r2 = eval(expr_str, self.npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test02a_sss(self): + """Checking mixed objects and start, stop, step (I)""" + + start, stop, step = (self.start, self.stop, 1) + expr = tb.Expr(self.expr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = expr.eval() + npvars = get_sliced_vars(self.npvars, start, stop, step) + r2 = eval(self.expr, npvars) + if 
common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test02b_sss(self): + """Checking mixed objects and start, stop, step (II)""" + + start, stop, step = (0, self.shape[0], self.step) + expr = tb.Expr(self.expr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = expr.eval() + npvars = get_sliced_vars(self.npvars, start, stop, step) + r2 = eval(self.expr, npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test02c_sss(self): + """Checking mixed objects and start, stop, step (III)""" + + start, stop, step = (self.start, self.stop, self.step) + expr = tb.Expr(self.expr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = expr.eval() + npvars = get_sliced_vars(self.npvars, start, stop, step) + r2 = eval(self.expr, npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test03_sss(self): + """Checking start, stop, step as numpy.int64.""" + + start, stop, step = [np.int64(i) for i in + (self.start, self.stop, self.step)] + expr = tb.Expr(self.expr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = expr.eval() + npvars = get_sliced_vars(self.npvars, start, stop, step) + r2 = eval(self.expr, npvars) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +class MixedContainers0(MixedContainersTestCase): + shape = (1,) + start, stop, step = (0, 1, 1) + + +class MixedContainers1(MixedContainersTestCase): + shape = (10,) + start, stop, step = (3, 6, 2) + + +class MixedContainers2(MixedContainersTestCase): + shape = (10, 5) + start, stop, step = (2, 9, 3) + + +class MixedContainers3(MixedContainersTestCase): + shape = (10, 3, 2) + start, stop, step = (2, -1, 1) + + +# Test for unaligned objects +class UnalignedObject(common.PyTablesTestCase): + + def test00_simple(self): + """Checking expressions with unaligned objects.""" + + # Build unaligned arrays + a0 = np.empty(10, dtype="int8") + a1 = np.arange(10, dtype="int32") + a2 = a1.copy() + a3 = a2.copy() + ra = np.rec.fromarrays([a0, a1, a2, a3]) + # The inputs + a = ra['f1'] + b = ra['f2'] + self.assertEqual(a.flags.aligned, False) + self.assertEqual(b.flags.aligned, False) + # The expression + sexpr = "2 * a + b" + expr = tb.Expr(sexpr) + r1 = expr.eval() + r2 = eval(sexpr) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01_md(self): + """Checking expressions with unaligned objects (MD version)""" + + # Build unaligned arrays + a0 = np.empty((10, 4), dtype="int8") + a1 = np.arange(10 * 4, dtype="int32").reshape(10, 4) + a2 = a1.copy() + a3 = a2.copy() + ra = np.rec.fromarrays([a0, a1, a2, a3]) + # The inputs + a = ra['f1'] + b = ra['f2'] + self.assertEqual(a.flags.aligned, False) + self.assertEqual(b.flags.aligned, False) + # The expression + sexpr = "2 * a + b" + expr = tb.Expr(sexpr) 
+ r1 = expr.eval() + r2 = eval(sexpr) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +# Test for non-contiguous objects +class NonContiguousObject(common.PyTablesTestCase): + + def test00_simple(self): + """Checking expressions with non-contiguous objects""" + + # Build non-contiguous arrays as inputs + a = np.arange(10, dtype="int32") + b = a[::2] + a = b * 2 + self.assertEqual(b.flags.contiguous, False) + self.assertEqual(b.flags.aligned, True) + # The expression + sexpr = "2 * a + b" + expr = tb.Expr(sexpr) + r1 = expr.eval() + r2 = eval(sexpr) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01a_md(self): + """Checking expressions with non-contiguous objects (MD version, I)""" + + # Build non-contiguous arrays + a = np.arange(10 * 4, dtype="int32").reshape(10, 4) + b = a[::2] + a = b * 2 + self.assertEqual(b.flags.contiguous, False) + self.assertEqual(b.flags.aligned, True) + # The expression + sexpr = "2 * a + b" + expr = tb.Expr(sexpr) + r1 = expr.eval() + r2 = eval(sexpr) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01b_md(self): + """Checking expressions with non-contiguous objects (MD version, II)""" + + # Build non-contiguous arrays + a = np.arange(10 * 4, dtype="int32").reshape(10, 4) + b = a[:, ::2] + a = b * 2 + self.assertEqual(b.flags.contiguous, False) + self.assertEqual(b.flags.aligned, True) + # The expression + sexpr = "2 * a + b" + expr = tb.Expr(sexpr) + r1 = expr.eval() + r2 = eval(sexpr) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +# Test for errors +class ExprError(common.TempFileMixin, common.PyTablesTestCase): + + # The shape for the variables in expressions + shape = (10,) + + def setUp(self): + super().setUp() + + # Define the NumPy variables to be used in expression + N = np.prod(self.shape) + self.a = np.arange(N, dtype='int32').reshape(self.shape) + self.b = np.arange(N, dtype='int64').reshape(self.shape) + self.c = np.arange(N, dtype='int32').reshape(self.shape) + self.r1 = np.empty(N, dtype='int64').reshape(self.shape) + + def _test00_shape(self): + """Checking that inconsistent shapes are detected.""" + + self.b = self.b.reshape(self.shape+(1,)) + expr = "a * b + c" + vars_ = {"a": self.a, "b": self.b, "c": self.c, } + expr = tb.Expr(expr, vars_) + self.assertRaises(ValueError, expr.eval) + + def test02_uint64(self): + """Checking that uint64 arrays in expression are detected.""" + + self.b = self.b.view('uint64') + expr = "a * b + c" + vars_ = {"a": self.a, "b": self.b, "c": self.c, } + self.assertRaises(NotImplementedError, tb.Expr, expr, vars_) + + def test03_table(self): + """Checking that tables in expression are detected.""" + + class Rec(tb.IsDescription): + col1 = tb.Int32Col() + col2 = tb.Int64Col() + + t = self.h5file.create_table("/", "a", Rec) + expr = "a * b + c" + vars_ = {"a": t, "b": self.b, "c": self.c, } + self.assertRaises(TypeError, tb.Expr, expr, 
vars_) + + def test04_nestedcols(self): + """Checking that nested cols in expression are detected.""" + + class Nested(tb.IsDescription): + col1 = tb.Int32Col() + + class col2(tb.IsDescription): + col3 = tb.Int64Col() + + t = self.h5file.create_table("/", "a", Nested) + expr = "a * b + c" + # The next non-nested column should work + a = t.cols.col2.col3 + vars_ = {"a": a, "b": self.b, "c": self.c, } + expr = tb.Expr(expr, vars_) + r1 = expr.eval() + self.assertIsNotNone(r1) + # But a nested column should not + a = t.cols.col2 + vars_ = {"a": a, "b": self.b, "c": self.c, } + self.assertRaises(TypeError, tb.Expr, expr, vars_) + + def test05_vlarray(self): + """Checking that VLArrays in expression are detected.""" + + vla = self.h5file.create_vlarray("/", "a", tb.Int32Col()) + expr = "a * b + c" + vars_ = {"a": vla, "b": self.b, "c": self.c, } + self.assertRaises(TypeError, tb.Expr, expr, vars_) + + +# Test for broadcasting arrays +class BroadcastTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_simple(self): + """Checking broadcast in expression.""" + + shapes = (self.shape1, self.shape2, self.shape3) + # Build arrays with different shapes as inputs + a = np.arange(np.prod(shapes[0]), dtype="i4").reshape(shapes[0]) + b = np.arange(np.prod(shapes[1]), dtype="i4").reshape(shapes[1]) + c = np.arange(np.prod(shapes[2]), dtype="i4").reshape(shapes[2]) + root = self.h5file.root + if a.shape[0] > 0: + a1 = self.h5file.create_array(root, 'a1', a) + else: + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=a.shape) + self.assertIsNotNone(a1) + b1 = self.h5file.create_array(root, 'b1', b) + self.assertIsNotNone(b1) + c1 = self.h5file.create_array(root, 'c1', c) + self.assertIsNotNone(c1) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + r1 = expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shapes:", self.shape1, self.shape2, self.shape3) + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +class Broadcast0(BroadcastTestCase): + shape1 = (0, 3, 4) + shape2 = (3, 4) + shape3 = (4,) + + +class Broadcast1(BroadcastTestCase): + shape1 = (2, 3, 4) + shape2 = (3, 4) + shape3 = (4,) + + +class Broadcast2(BroadcastTestCase): + shape1 = (3, 4,) + shape2 = (3, 4) + shape3 = (4,) + + +class Broadcast3(BroadcastTestCase): + shape1 = (4,) + shape2 = (3, 4) + shape3 = (4,) + + +class Broadcast4(BroadcastTestCase): + shape1 = (1,) + shape2 = (3, 4) + shape3 = (4,) + + +class Broadcast5(BroadcastTestCase): + shape1 = (1,) + shape2 = (3, 1) + shape3 = (4,) + + +# Test for different length inputs +class DiffLengthTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_simple(self): + """Checking different length inputs in expression.""" + + shapes = (list(self.shape1), list(self.shape2), list(self.shape3)) + # Build arrays with different shapes as inputs + a = np.arange(np.prod(shapes[0]), dtype="i4").reshape(shapes[0]) + b = np.arange(np.prod(shapes[1]), dtype="i4").reshape(shapes[1]) + c = np.arange(np.prod(shapes[2]), dtype="i4").reshape(shapes[2]) + # The expression + expr = tb.Expr("2 * a + b-c") + r1 = expr.eval() + # Compute the minimum length for shapes + maxdim = max([len(shape) for shape in shapes]) + minlen = min([shape[0] for i, shape in enumerate(shapes) + if len(shape) == maxdim]) + for i, shape in enumerate(shapes): + if len(shape) == maxdim: + shape[0] = minlen + # Build 
arrays with the new shapes as inputs + a = np.arange(np.prod(shapes[0]), dtype="i4").reshape(shapes[0]) + self.assertIsNotNone(a) + b = np.arange(np.prod(shapes[1]), dtype="i4").reshape(shapes[1]) + self.assertIsNotNone(b) + c = np.arange(np.prod(shapes[2]), dtype="i4").reshape(shapes[2]) + self.assertIsNotNone(c) + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shapes:", self.shape1, self.shape2, self.shape3) + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +class DiffLength0(DiffLengthTestCase): + shape1 = (0,) + shape2 = (10,) + shape3 = (20,) + + +class DiffLength1(DiffLengthTestCase): + shape1 = (3,) + shape2 = (10,) + shape3 = (20,) + + +class DiffLength2(DiffLengthTestCase): + shape1 = (3, 4) + shape2 = (2, 3, 4) + shape3 = (4, 3, 4) + + +class DiffLength3(DiffLengthTestCase): + shape1 = (1, 3, 4) + shape2 = (2, 3, 4) + shape3 = (4, 3, 4) + + +class DiffLength4(DiffLengthTestCase): + shape1 = (0, 3, 4) + shape2 = (2, 3, 4) + shape3 = (4, 3, 4) + + +# Test for different type inputs +class TypesTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_bool(self): + """Checking booleans in expression.""" + + # Build arrays with different shapes as inputs + a = np.array([True, False, True]) + b = np.array([False, True, False]) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + self.assertIsNotNone(a1) + b1 = self.h5file.create_array(root, 'b1', b) + self.assertIsNotNone(b1) + expr = tb.Expr("a | b") + r1 = expr.eval() + r2 = eval("a | b") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01_shortint(self): + """Checking int8,uint8,int16,uint16 and int32 in expression.""" + + for dtype in 'int8', 'uint8', 'int16', 'uint16', 'int32': + if common.verbose: + print("Checking type:", dtype) + # Build arrays with different shapes as inputs + a = np.array([1, 2, 3], dtype) + b = np.array([3, 4, 5], dtype) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + b1 = self.h5file.create_array(root, 'b1', b) + two = np.int32(2) + self.assertIsInstance(two, np.integer) + expr = tb.Expr("two * a1-b1") + r1 = expr.eval() + a = np.array([1, 2, 3], 'int32') + b = np.array([3, 4, 5], 'int32') + r2 = eval("two * a-b") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertEqual(r1.dtype, r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + # Remove created leaves + a1.remove() + b1.remove() + + def test02_longint(self): + """Checking uint32 and int64 in expression.""" + + for dtype in 'uint32', 'int64': + if common.verbose: + print("Checking type:", dtype) + # Build arrays with different shapes as inputs + a = np.array([1, 2, 3], dtype) + b = np.array([3, 4, 5], dtype) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + b1 = self.h5file.create_array(root, 'b1', b) + expr = tb.Expr("2 * a1-b1") + r1 = expr.eval() + a = np.array([1, 2, 3], 'int64') + b = np.array([3, 4, 5], 'int64') + r2 = eval("2 * a-b") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertEqual(r1.dtype, r2.dtype) + 
self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + # Remove created leaves + a1.remove() + b1.remove() + + def test03_float(self): + """Checking float32 and float64 in expression.""" + + for dtype in 'float32', 'float64': + if common.verbose: + print("Checking type:", dtype) + # Build arrays with different shapes as inputs + a = np.array([1, 2, 3], dtype) + b = np.array([3, 4, 5], dtype) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + b1 = self.h5file.create_array(root, 'b1', b) + expr = tb.Expr("2 * a1-b1") + r1 = expr.eval() + a = np.array([1, 2, 3], dtype) + b = np.array([3, 4, 5], dtype) + r2 = eval("2 * a-b") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertEqual(r1.dtype, r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + # Remove created leaves + a1.remove() + b1.remove() + + def test04_complex(self): + """Checking complex64 and complex128 in expression.""" + + for dtype in 'complex64', 'complex128': + if common.verbose: + print("Checking type:", dtype) + # Build arrays with different shapes as inputs + a = np.array([1, 2j, 3 + 2j], dtype) + b = np.array([3, 4j, 5 + 1j], dtype) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + b1 = self.h5file.create_array(root, 'b1', b) + expr = tb.Expr("2 * a1-b1") + r1 = expr.eval() + a = np.array([1, 2j, 3 + 2j], 'complex128') + b = np.array([3, 4j, 5 + 1j], 'complex128') + r2 = eval("2 * a-b") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertEqual(r1.dtype, r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + # Remove created leaves + a1.remove() + b1.remove() + + def test05_string(self): + """Checking strings in expression.""" + + # Build arrays with different shapes as inputs + a = np.array(['a', 'bd', 'cd'], 'S') + b = np.array(['a', 'bdcd', 'ccdc'], 'S') + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + self.assertIsNotNone(a1) + b1 = self.h5file.create_array(root, 'b1', b) + self.assertIsNotNone(b1) + expr = tb.Expr("(a1 > b'a') | ( b1 > b'b')") + r1 = expr.eval() + r2 = eval("(a > b'a') | ( b > b'b')") + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +# Test for different functions +class FunctionsTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_simple(self): + """Checking some math functions in expression.""" + + # Build arrays with different shapes as inputs + a = np.array([.1, .2, .3]) + b = np.array([.3, .4, .5]) + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + self.assertIsNotNone(a1) + b1 = self.h5file.create_array(root, 'b1', b) + self.assertIsNotNone(b1) + # The expression + expr = tb.Expr("sin(a1) * sqrt(b1)") + r1 = expr.eval() + r2 = np.sin(a) * np.sqrt(b) + if common.verbose: + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + +# Test for EArrays with maindim != 0 +class MaindimTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_simple(self): + """Checking other 
dimensions than 0 as main dimension.""" + + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + root = self.h5file.root + shape[self.maindim] = 0 + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + c1.append(c) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + r1 = expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01_out(self): + """Checking other dimensions than 0 as main dimension (out)""" + + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + root = self.h5file.root + shape[self.maindim] = 0 + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + c1.append(c) + r1.append(c) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + expr.set_output(r1) + expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + def test02_diff_in_maindims(self): + """Checking different main dimensions in inputs.""" + + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + root = self.h5file.root + shape2 = shape[:] + shape[self.maindim] = 0 + shape2[0] = 0 + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + self.assertEqual(a1.maindim, self.maindim) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape2) + self.assertEqual(b1.maindim, 0) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + c1.append(c) + r1.append(c) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + r1 = expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test03_diff_in_out_maindims(self): + """Checking different maindims in inputs and output.""" + + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + root = self.h5file.root + shape2 = shape[:] + shape[self.maindim] = 0 + shape2[0] = 0 + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + self.assertEqual(a1.maindim, self.maindim) + b1 = self.h5file.create_earray( + root, 'b1', 
atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape2) + self.assertEqual(r1.maindim, 0) + a1.append(a) + b1.append(b) + c1.append(c) + r1.append(c) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + expr.set_output(r1) + expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + def test04_diff_in_out_maindims_lengths(self): + """Checking different maindims and lengths in inputs and output.""" + + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + root = self.h5file.root + shape2 = shape[:] + shape[self.maindim] = 0 + shape2[0] = 0 + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + self.assertEqual(a1.maindim, self.maindim) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape2) + self.assertEqual(r1.maindim, 0) + a1.append(a) + a1.append(a) + b1.append(b) + b1.append(b) + c1.append(c) + c1.append(c) + r1.append(c) # just once so that output is smaller + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + expr.set_output(r1) + # This should raise an error + self.assertRaises(ValueError, expr.eval) + + +class Maindim0(MaindimTestCase): + maindim = 1 + shape = (1, 2) + + +class Maindim1(MaindimTestCase): + maindim = 1 + shape = (2, 3) + + +class Maindim2(MaindimTestCase): + maindim = 1 + shape = (2, 3, 4) + + +class Maindim3(MaindimTestCase): + maindim = 2 + shape = (2, 3, 4) + + +# Test `append` mode flag in `set_output()` +class AppendModeTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_append(self): + """Checking append mode in `set_output()`""" + + shape = [3, 2] + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + c = a.copy() + shape[1] = 0 + root = self.h5file.root + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + c1.append(c) + if not self.append: + r1.append(c) + # The expression + expr = tb.Expr("2 * a1 + b1-c1") + expr.set_output(r1, append_mode=self.append) + expr.eval() + r2 = eval("2 * a + b-c") + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + +class AppendModeTrue(AppendModeTestCase): + append = True + + +class AppendModeFalse(AppendModeTestCase): + append = False + + +# Test for `__iter__()` iterator +class iterTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + shape = list(self.shape) + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() 
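# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# iterTestCase below drives tables.Expr through its __iter__ protocol and
# restricts the evaluated rows with set_inputs_range().  A compact standalone
# sketch of the same pattern, using plain NumPy operands picked up from the
# local namespace (array names are our own):

import numpy as np
import tables as tb

a = np.arange(10, dtype="i4")
b = np.arange(10, dtype="i4")
expr = tb.Expr("2 * a + b")
expr.set_inputs_range(2, 8, 2)         # start, stop, step along the main dim
rows = np.array([row for row in expr])
print(rows)                            # should equal (2 * a + b)[2:8:2]
# --- end of aside -----------------------------------------------------------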
+ c = a.copy() + self.npvars = {'a': a, 'b': b, 'c': c} + shape[self.maindim] = 0 + root = self.h5file.root + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + c1 = self.h5file.create_earray( + root, 'c1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + c1.append(c) + self.vars = {'a': a1, 'b': b1, 'c': c1} + # The expression + self.sexpr = "2 * a + b-c" + + def test00_iter(self): + """Checking the __iter__ iterator.""" + + expr = tb.Expr(self.sexpr, self.vars) + r1 = np.array([row for row in expr]) + r2 = eval(self.sexpr, self.npvars) + if common.verbose: + print("Tested shape, maindim:", self.shape, self.maindim) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + def test01a_sss(self): + """Checking the __iter__ iterator (with ranges, I)""" + + start, stop, step = self.range_[0], None, None + expr = tb.Expr(self.sexpr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = np.array([row for row in expr]) + npvars = get_sliced_vars2( + self.npvars, start, stop, step, self.shape, self.maindim) + r2 = eval(self.sexpr, npvars) + if common.verbose: + print("Tested shape, maindim:", self.shape, self.maindim) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + def test01b_sss(self): + """Checking the __iter__ iterator (with ranges, II)""" + + start, stop, step = self.range_[0], self.range_[2], None + expr = tb.Expr(self.sexpr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = np.array([row for row in expr]) + npvars = get_sliced_vars2( + self.npvars, start, stop, step, self.shape, self.maindim) + r2 = eval(self.sexpr, npvars) + if common.verbose: + print("Tested shape, maindim:", self.shape, self.maindim) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + def test01c_sss(self): + """Checking the __iter__ iterator (with ranges, III)""" + + start, stop, step = self.range_ + expr = tb.Expr(self.sexpr, self.vars) + expr.set_inputs_range(start, stop, step) + r1 = np.array([row for row in expr]) + npvars = get_sliced_vars2( + self.npvars, start, stop, step, self.shape, self.maindim) + r2 = eval(self.sexpr, npvars) + if common.verbose: + print("Tested shape, maindim:", self.shape, self.maindim) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r2), + "Evaluate is returning a wrong value.") + + +class iter0(iterTestCase): + maindim = 0 + shape = (0,) + range_ = (1, 2, 1) + + +class iter1(iterTestCase): + maindim = 0 + shape = (3,) + range_ = (1, 2, 1) + + +class iter2(iterTestCase): + maindim = 0 + shape = (3, 2) + range_ = (0, 3, 2) + + +class iter3(iterTestCase): + maindim = 1 + shape = (3, 2) + range_ = (0, 3, 2) + + +class iter4(iterTestCase): + maindim = 2 + shape = (3, 2, 1) + range_ = (1, 3, 2) + + +class iter5(iterTestCase): + maindim = 2 + shape = (1, 2, 5) + range_ = (0, 4, 2) + + +# Test for set_output_range +class setOutputRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def 
test00_simple(self): + """Checking the range selection for output.""" + + shape = list(self.shape) + start, stop, step = self.range_ + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + r = a.copy() + root = self.h5file.root + a1 = self.h5file.create_array(root, 'a1', a) + self.assertIsNotNone(a1) + b1 = self.h5file.create_array(root, 'b1', b) + self.assertIsNotNone(b1) + r1 = self.h5file.create_array(root, 'r1', r) + # The expression + expr = tb.Expr("a1-b1-1") + expr.set_output(r1) + expr.set_output_range(start, stop, step) + expr.eval() + r2 = eval("a-b-1") + r[start:stop:step] = r2[:len(range(start, stop, step))] + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r), r.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r), + "Evaluate is returning a wrong value.") + + def test01_maindim(self): + """Checking the range selection for output (maindim > 0)""" + + shape = list(self.shape) + start, stop, step = self.range_ + # Build input arrays + a = np.arange(np.prod(shape), dtype="i4").reshape(shape) + b = a.copy() + r = a.copy() + shape[self.maindim] = 0 + root = self.h5file.root + a1 = self.h5file.create_earray( + root, 'a1', atom=tb.Int32Col(), shape=shape) + b1 = self.h5file.create_earray( + root, 'b1', atom=tb.Int32Col(), shape=shape) + r1 = self.h5file.create_earray( + root, 'r1', atom=tb.Int32Col(), shape=shape) + a1.append(a) + b1.append(b) + r1.append(r) + # The expression + expr = tb.Expr("a1-b1-1") + expr.set_output(r1) + expr.set_output_range(start, stop, step) + expr.eval() + r2 = eval("a-b-1") + lsl = tuple([slice(None)] * self.maindim) + # print "lsl-->", lsl + (slice(start,stop,step),) + lrange = len(range(start, stop, step)) + r.__setitem__(lsl + (slice(start, stop, step),), + r2.__getitem__(lsl + (slice(0, lrange),))) + if common.verbose: + print("Tested shape:", shape) + print("Computed expression:", repr(r1[:]), r1.dtype) + print("Should look like:", repr(r), r.dtype) + self.assertTrue(common.areArraysEqual(r1[:], r), + "Evaluate is returning a wrong value.") + + +class setOutputRange0(setOutputRangeTestCase): + maindim = 0 + shape = (10,) + range_ = (0, 1, 2) + + +class setOutputRange1(setOutputRangeTestCase): + maindim = 0 + shape = (10,) + range_ = (0, 10, 2) + + +class setOutputRange2(setOutputRangeTestCase): + maindim = 0 + shape = (10,) + range_ = (1, 10, 2) + + +class setOutputRange3(setOutputRangeTestCase): + maindim = 0 + shape = (10, 1) + range_ = (1, 10, 3) + + +class setOutputRange4(setOutputRangeTestCase): + maindim = 0 + shape = (10, 2) + range_ = (1, 10, 3) + + +class setOutputRange5(setOutputRangeTestCase): + maindim = 0 + shape = (5, 3, 1) + range_ = (1, 5, 1) + + +class setOutputRange6(setOutputRangeTestCase): + maindim = 1 + shape = (2, 5) + range_ = (1, 3, 2) + + +class setOutputRange7(setOutputRangeTestCase): + maindim = 1 + shape = (2, 5, 1) + range_ = (1, 3, 2) + + +class setOutputRange8(setOutputRangeTestCase): + maindim = 2 + shape = (1, 3, 5) + range_ = (1, 5, 2) + + +class setOutputRange9(setOutputRangeTestCase): + maindim = 3 + shape = (1, 3, 4, 5) + range_ = (1, 5, 3) + + +# Test for very large inputs +class VeryLargeInputsTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test00_simple(self): + """Checking very large inputs.""" + + shape = self.shape + # Use filters so as to not use too much space + if tb.which_lib_version("blosc") is not None: + filters = tb.Filters(complevel=1, 
complib='blosc', + shuffle=False) + elif tb.which_lib_version("lzo") is not None: + filters = tb.Filters(complevel=1, complib='lzo', shuffle=False) + else: + filters = tb.Filters(complevel=1, shuffle=False) + # Build input arrays + root = self.h5file.root + a = self.h5file.create_carray(root, 'a', + atom=tb.Float64Atom(dflt=3), + shape=shape, filters=filters) + self.assertIsNotNone(a) + b = self.h5file.create_carray(root, 'b', + atom=tb.Float64Atom(dflt=2), + shape=shape, filters=filters) + self.assertIsNotNone(b) + r1 = self.h5file.create_carray(root, 'r1', + atom=tb.Float64Atom(dflt=3), + shape=shape, filters=filters) + # The expression + expr = tb.Expr("a * b-6") # Should give 0 + expr.set_output(r1) + expr.eval() + r1 = r1[-10:] # Get the last ten rows + r2 = np.zeros(10, dtype='float64') + if common.verbose: + print("Tested shape:", shape) + print("Ten last rows:", repr(r1), r1.dtype) + print("Should look like:", repr(r2), r2.dtype) + self.assertTrue(common.areArraysEqual(r1, r2), + "Evaluate is returning a wrong value.") + + def test01_iter(self): + """Checking very large inputs (__iter__ version)""" + + shape = self.shape + if shape[0] >= 2**24: + # The iterator is much more slower, so don't run it for + # extremeley large arrays. + if common.verbose: + print("Skipping this *very* long test") + return + # Use filters so as to not use too much space + if tb.which_lib_version("lzo") is not None: + filters = tb.Filters(complevel=1, complib='lzo', shuffle=False) + else: + filters = tb.Filters(complevel=1, shuffle=False) + + # Build input arrays + root = self.h5file.root + a = self.h5file.create_carray(root, 'a', + atom=tb.Int32Atom(dflt=1), + shape=shape, filters=filters) + self.assertIsNotNone(a) + b = self.h5file.create_carray(root, 'b', + atom=tb.Int32Atom(dflt=2), + shape=shape, filters=filters) + self.assertIsNotNone(b) + r1 = self.h5file.create_carray(root, 'r1', + atom=tb.Int32Atom(dflt=3), + shape=shape, filters=filters) + # The expression + expr = tb.Expr("a-b + 1") + r1 = sum(expr) # Should give 0 + if common.verbose: + print("Tested shape:", shape) + print("Cummulated sum:", r1) + print("Should look like:", 0) + self.assertEqual(r1, 0, "Evaluate is returning a wrong value.") + + +# The next can go on regular tests, as it should be light enough +class VeryLargeInputs1(VeryLargeInputsTestCase): + shape = (2**20,) # larger than any internal I/O buffers + + +# The next is only meant for 'heavy' mode as it can take more than 1 minute +# on modern machines +class VeryLargeInputs2(VeryLargeInputsTestCase): + shape = (2**32 + 1,) # check that arrays > 32-bit are supported + + +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(ExprNumPy)) + theSuite.addTest(common.unittest.makeSuite(ExprArray)) + theSuite.addTest(common.unittest.makeSuite(ExprCArray)) + theSuite.addTest(common.unittest.makeSuite(ExprEArray)) + theSuite.addTest(common.unittest.makeSuite(ExprColumn)) + theSuite.addTest(common.unittest.makeSuite(MixedContainers0)) + theSuite.addTest(common.unittest.makeSuite(MixedContainers1)) + theSuite.addTest(common.unittest.makeSuite(MixedContainers2)) + theSuite.addTest(common.unittest.makeSuite(MixedContainers3)) + theSuite.addTest(common.unittest.makeSuite(UnalignedObject)) + theSuite.addTest(common.unittest.makeSuite(NonContiguousObject)) + 
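# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# VeryLargeInputsTestCase above evaluates an expression blockwise and streams
# the result into an on-disk CArray via set_output(), so nothing close to the
# full operand size is ever held in memory.  A small standalone sketch of that
# out-of-core pattern (file and node names are our own):

import tables as tb

with tb.open_file("expr_outofcore_demo.h5", "w") as h5f:
    shape = (1_000_000,)
    a = h5f.create_carray("/", "a", atom=tb.Float64Atom(dflt=3), shape=shape)
    b = h5f.create_carray("/", "b", atom=tb.Float64Atom(dflt=2), shape=shape)
    r = h5f.create_carray("/", "r", atom=tb.Float64Atom(), shape=shape)
    expr = tb.Expr("a * b - 6")        # 3 * 2 - 6 == 0 everywhere
    expr.set_output(r)                 # write results straight into `r`
    expr.eval()
    print(r[-5:])                      # expect five zeros
# --- end of aside -----------------------------------------------------------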
theSuite.addTest(common.unittest.makeSuite(ExprError)) + theSuite.addTest(common.unittest.makeSuite(Broadcast0)) + theSuite.addTest(common.unittest.makeSuite(Broadcast1)) + theSuite.addTest(common.unittest.makeSuite(Broadcast2)) + theSuite.addTest(common.unittest.makeSuite(Broadcast3)) + theSuite.addTest(common.unittest.makeSuite(Broadcast4)) + theSuite.addTest(common.unittest.makeSuite(Broadcast5)) + theSuite.addTest(common.unittest.makeSuite(DiffLength0)) + theSuite.addTest(common.unittest.makeSuite(DiffLength1)) + theSuite.addTest(common.unittest.makeSuite(DiffLength2)) + theSuite.addTest(common.unittest.makeSuite(DiffLength3)) + theSuite.addTest(common.unittest.makeSuite(DiffLength4)) + theSuite.addTest(common.unittest.makeSuite(TypesTestCase)) + theSuite.addTest(common.unittest.makeSuite(FunctionsTestCase)) + theSuite.addTest(common.unittest.makeSuite(Maindim0)) + theSuite.addTest(common.unittest.makeSuite(Maindim1)) + theSuite.addTest(common.unittest.makeSuite(Maindim2)) + theSuite.addTest(common.unittest.makeSuite(Maindim3)) + theSuite.addTest(common.unittest.makeSuite(AppendModeTrue)) + theSuite.addTest(common.unittest.makeSuite(AppendModeFalse)) + theSuite.addTest(common.unittest.makeSuite(iter0)) + theSuite.addTest(common.unittest.makeSuite(iter1)) + theSuite.addTest(common.unittest.makeSuite(iter2)) + theSuite.addTest(common.unittest.makeSuite(iter3)) + theSuite.addTest(common.unittest.makeSuite(iter4)) + theSuite.addTest(common.unittest.makeSuite(iter5)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange0)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange1)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange2)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange3)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange4)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange5)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange6)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange7)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange8)) + theSuite.addTest(common.unittest.makeSuite(setOutputRange9)) + theSuite.addTest(common.unittest.makeSuite(VeryLargeInputs1)) + if common.heavy: + theSuite.addTest(common.unittest.makeSuite(VeryLargeInputs2)) + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_garbage.py b/tables/tests/test_garbage.py new file mode 100644 index 0000000..7914ff3 --- /dev/null +++ b/tables/tests/test_garbage.py @@ -0,0 +1,51 @@ +"""Test module for detecting uncollectable garbage in PyTables. + +This test module *must* be loaded in the last place. It just checks for +the existence of uncollectable garbage in ``gc.garbage`` after running +all the tests. + +""" + +import gc + +from tables.tests import common + + +class GarbageTestCase(common.PyTablesTestCase): + """Test for uncollectable garbage.""" + + def test00(self): + """Checking for uncollectable garbage.""" + + garbageLen = len(gc.garbage) + if garbageLen == 0: + return # success + + if common.verbose: + classCount = {} + # Count uncollected objects for each class. + for obj in gc.garbage: + objClass = obj.__class__.__name__ + if objClass in classCount: + classCount[objClass] += 1 + else: + classCount[objClass] = 1 + incidence = ['``%s``: %d' % (cls, cnt) + for (cls, cnt) in classCount.items()] + print("Class incidence:", ', '.join(incidence)) + self.fail("Possible leak: %d uncollected objects." 
% garbageLen) + + +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + theSuite = common.unittest.TestSuite() + theSuite.addTest(common.unittest.makeSuite(GarbageTestCase)) + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_hdf5compat.py b/tables/tests/test_hdf5compat.py new file mode 100644 index 0000000..12da99f --- /dev/null +++ b/tables/tests/test_hdf5compat.py @@ -0,0 +1,403 @@ +"""Test module for compatibility with plain HDF files.""" + +import shutil +import tempfile +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +class PaddedArrayTestCase(common.TestFileMixin, common.PyTablesTestCase): + """Test for H5T_COMPOUND (Table) datatype with padding. + + Regression test for issue gh-734 + + itemsize.h5 was created with h5py with the array `expectedData` (see below) + in the table `/Test`: + 'A' and 'B' are 4 + 4 bytes, with 8 bytes padding. + + $ h5ls -v itemsize.h5 + Test Dataset {3/3} + Location: 1:800 + Links: 1 + Storage: 48 logical bytes, 48 allocated bytes, 100.00% utilization + Type: struct { + "A" +0 native unsigned int + "B" +4 native unsigned int + } 16 bytes + + """ + h5fname = common.test_filename('itemsize.h5') + + def test(self): + arr = self.h5file.get_node('/Test') + data = arr.read() + expectedData = np.array( + [(1, 11), (2, 12), (3, 13)], + dtype={'names': ['A', 'B'], 'formats': [' 2 + table.row['var2'] = i % 2 + table.row['var3'] = i + table.row['var4'] = float(self.nrows - i - 1) + table.row.append() + table.flush() + # Index all entries: + for col in table.colinstances.values(): + indexrows = col.create_index(_blocksizes=small_blocksizes) + if common.verbose: + print("Number of written rows:", self.nrows) + print("Number of indexed rows:", indexrows) + + return + + def test00_flushLastRow(self): + """Checking flushing an Index incrementing only the last row.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_flushLastRow..." % + self.__class__.__name__) + + # Open the HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="a") + table = self.h5file.root.table + # Add just 3 rows more + for i in range(3): + table.row['var1'] = str(i).encode('ascii') + table.row.append() + table.flush() # redo the indexes + idxcol = table.cols.var1.index + if common.verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Number of elements per slice:", idxcol.slicesize) + print("Chunk size:", idxcol.sorted.chunksize) + print("Elements in last row:", idxcol.indicesLR[-1]) + + # Do a selection + results = [p["var1"] for p in table.where('var1 == b"1"')] + self.assertEqual(len(results), 2) + self.assertEqual(results, [b'1']*2) + + def test00_update(self): + """Checking automatic re-indexing after an update operation.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_update..." 
% self.__class__.__name__) + + # Open the HDF5 file in append mode + self.h5file = tb.open_file(self.h5fname, mode="a") + table = self.h5file.root.table + # Modify a couple of columns + for i, row in enumerate(table.where("(var3>1) & (var3<5)")): + row['var1'] = str(i) + row['var3'] = i + row.update() + table.flush() # redo the indexes + idxcol1 = table.cols.var1.index + idxcol3 = table.cols.var3.index + if common.verbose: + print("Dirtyness of var1 col:", idxcol1.dirty) + print("Dirtyness of var3 col:", idxcol3.dirty) + self.assertEqual(idxcol1.dirty, False) + self.assertEqual(idxcol3.dirty, False) + + # Do a couple of selections + results = [p["var1"] for p in table.where('var1 == b"1"')] + self.assertEqual(len(results), 2) + self.assertEqual(results, [b'1']*2) + results = [p["var3"] for p in table.where('var3 == 0')] + self.assertEqual(len(results), 2) + self.assertEqual(results, [0]*2) + + def test01_readIndex(self): + """Checking reading an Index (string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readIndex..." % self.__class__.__name__) + + # Open the HDF5 file in read-only mode + self.h5file = tb.open_file(self.h5fname, mode="r") + table = self.h5file.root.table + idxcol = table.cols.var1.index + if common.verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Number of elements per slice:", idxcol.slicesize) + print("Chunk size:", idxcol.sorted.chunksize) + + # Do a selection + results = [p["var1"] for p in table.where('var1 == b"1"')] + self.assertEqual(len(results), 1) + self.assertEqual(results, [b'1']) + + def test02_readIndex(self): + """Checking reading an Index (bool flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_readIndex..." % self.__class__.__name__) + + # Open the HDF5 file in read-only mode + self.h5file = tb.open_file(self.h5fname, mode="r") + table = self.h5file.root.table + idxcol = table.cols.var2.index + if common.verbose: + print("Rows in table:", table.nrows) + print("Max rows in buf:", table.nrowsinbuf) + print("Number of elements per slice:", idxcol.slicesize) + print("Chunk size:", idxcol.sorted.chunksize) + + # Do a selection + results = [p["var2"] for p in table.where('var2 == True')] + if common.verbose: + print("Selected values:", results) + self.assertEqual(len(results), self.nrows // 2) + self.assertEqual(results, [True]*(self.nrows // 2)) + + def test03_readIndex(self): + """Checking reading an Index (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_readIndex..." 
% self.__class__.__name__) + + # Open the HDF5 file in read-only mode + self.h5file = tb.open_file(self.h5fname, mode="r") + table = self.h5file.root.table + idxcol = table.cols.var3.index + if common.verbose: + print("Max rows in buf:", table.nrowsinbuf) + print("Number of elements per slice:", idxcol.slicesize) + print("Chunk size:", idxcol.sorted.chunksize) + + # Do a selection + results = [p["var3"] for p in table.where('(1 500: + tests.append(self.nrows - 500) + for limit in tests: + handle_a = [0, table.where('(var3 < e)', dict(e=limit))] + handle_b = [0, table.where('(var3 < e)', dict(e=limit))] + + try: + while True: + next(handle_b[1]) + handle_b[0] += 1 + except StopIteration: + for _ in handle_a[1]: + handle_a[0] += 1 + for _ in handle_b[1]: + handle_b[0] += 1 + + self.assertEqual(handle_a[0], limit) + self.assertEqual(handle_b[0], limit) + self.assertEqual( + len(list(table.where('(var3 < e)', dict(e=limit)))), limit) + + +small_ss = small_blocksizes[2] + + +class BasicReadTestCase(BasicTestCase): + compress = 0 + complib = "zlib" + shuffle = 0 + fletcher32 = 0 + nrows = small_ss + + +class ZlibReadTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + shuffle = 0 + fletcher32 = 0 + nrows = small_ss + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscReadTestCase(BasicTestCase): + compress = 1 + complib = "blosc" + shuffle = 0 + fletcher32 = 0 + nrows = small_ss + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class LZOReadTestCase(BasicTestCase): + compress = 1 + complib = "lzo" + shuffle = 0 + fletcher32 = 0 + nrows = small_ss + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ReadTestCase(BasicTestCase): + compress = 1 + complib = "bzip2" + shuffle = 0 + fletcher32 = 0 + nrows = small_ss + + +class ShuffleReadTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + shuffle = 1 + fletcher32 = 0 + nrows = small_ss + + +class Fletcher32ReadTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + shuffle = 0 + fletcher32 = 1 + nrows = small_ss + + +class ShuffleFletcher32ReadTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + shuffle = 1 + fletcher32 = 1 + nrows = small_ss + + +class OneHalfTestCase(BasicTestCase): + nrows = small_ss + small_ss//2 + + +class UpperBoundTestCase(BasicTestCase): + nrows = small_ss + 1 + + +class LowerBoundTestCase(BasicTestCase): + nrows = small_ss * 2-1 + + +class DeepTableIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = minRowIndex + + def test01(self): + """Checking the indexing of a table in a 2nd level hierarchy""" + + # Create an instance of an HDF5 Table + group = self.h5file.create_group(self.h5file.root, "agroup") + # Create a table + title = "This is the IndexArray title" + table = self.h5file.create_table(group, 'table', TDescr, title, + None, self.nrows) + for i in range(self.nrows): + # Fill rows with defaults + table.row.append() + table.flush() + # Index some column + indexrows = table.cols.var1.create_index() + self.assertIsNotNone(indexrows) + idxcol = table.cols.var1.index + # Some sanity checks + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(idxcol) + self.assertEqual(idxcol.nelements, self.nrows) + + def test01b(self): + """Checking the indexing of a table in 2nd level + (persistent version)""" + + # Create an instance of an HDF5 Table + group = self.h5file.create_group(self.h5file.root, "agroup") + + 
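# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# The surrounding cases index a column with Column.create_index() and then
# query it through Table.where(); the index is used transparently by the
# query.  A self-contained sketch of that flow (description, file and column
# names are our own):

import tables as tb

class Demo(tb.IsDescription):
    ident = tb.Int32Col()
    energy = tb.Float64Col()

with tb.open_file("index_demo.h5", "w") as h5f:
    table = h5f.create_table("/", "particles", Demo)
    for i in range(1000):
        table.row["ident"] = i
        table.row["energy"] = float(i)
        table.row.append()
    table.flush()
    table.cols.energy.create_index()   # standard (non-CSI) index
    hits = [row["ident"] for row in table.where("energy > 995.0")]
    print(hits)                        # [996, 997, 998, 999]
# --- end of aside -----------------------------------------------------------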
# Create a table + title = "This is the IndexArray title" + table = self.h5file.create_table(group, 'table', TDescr, title, + None, self.nrows) + for i in range(self.nrows): + # Fill rows with defaults + table.row.append() + table.flush() + + # Index some column + indexrows = table.cols.var1.create_index() + self.assertIsNotNone(indexrows) + idxcol = table.cols.var1.index + + # Close and re-open this file + self._reopen(mode='a') + + table = self.h5file.root.agroup.table + idxcol = table.cols.var1.index + # Some sanity checks + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(idxcol) + self.assertEqual(idxcol.nelements, self.nrows) + + def test02(self): + """Checking the indexing of a table in a 4th level hierarchy""" + + # Create an instance of an HDF5 Table + group = self.h5file.create_group(self.h5file.root, "agroup") + group = self.h5file.create_group(group, "agroup") + group = self.h5file.create_group(group, "agroup") + + # Create a table + title = "This is the IndexArray title" + table = self.h5file.create_table(group, 'table', TDescr, title, + None, self.nrows) + for i in range(self.nrows): + # Fill rows with defaults + table.row.append() + table.flush() + + # Index some column + indexrows = table.cols.var1.create_index() + self.assertIsNotNone(indexrows) + idxcol = table.cols.var1.index + + # Some sanity checks + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(idxcol) + self.assertEqual(idxcol.nelements, self.nrows) + + def test02b(self): + """Checking the indexing of a table in a 4th level + (persistent version)""" + + # Create an instance of an HDF5 Table + group = self.h5file.create_group(self.h5file.root, "agroup") + group = self.h5file.create_group(group, "agroup") + group = self.h5file.create_group(group, "agroup") + + # Create a table + title = "This is the IndexArray title" + table = self.h5file.create_table(group, 'table', TDescr, title, + None, self.nrows) + for i in range(self.nrows): + # Fill rows with defaults + table.row.append() + table.flush() + + # Index some column + indexrows = table.cols.var1.create_index() + self.assertIsNotNone(indexrows) + idxcol = table.cols.var1.index + + # Close and re-open this file + self._reopen(mode='a') + + table = self.h5file.root.agroup.agroup.agroup.table + idxcol = table.cols.var1.index + + # Some sanity checks + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(idxcol) + self.assertEqual(idxcol.nelements, self.nrows) + + def test03(self): + """Checking the indexing of a table in a 100th level hierarchy""" + + # Create an instance of an HDF5 Table + group = self.h5file.root + for i in range(100): + group = self.h5file.create_group(group, "agroup") + + # Create a table + title = "This is the IndexArray title" + table = self.h5file.create_table(group, 'table', TDescr, title, + None, self.nrows) + for i in range(self.nrows): + # Fill rows with defaults + table.row.append() + table.flush() + + # Index some column + indexrows = table.cols.var1.create_index() + self.assertIsNotNone(indexrows) + idxcol = table.cols.var1.index + + # Some sanity checks + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(idxcol) + self.assertEqual(idxcol.nelements, self.nrows) + + +class IndexProps: + def __init__(self, auto=tb.index.default_auto_index, + filters=tb.index.default_index_filters): + self.auto = auto + self.filters = filters + + +DefaultProps = IndexProps() +NoAutoProps = IndexProps(auto=False) +ChangeFiltersProps = IndexProps( + filters=tb.Filters(complevel=6, 
complib="zlib", + shuffle=False, fletcher32=False)) + + +class AutomaticIndexingTestCase(common.TempFileMixin, common.PyTablesTestCase): + reopen = 1 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + small_blocksizes = (16, 8, 4, 2) + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + title = "This is the IndexArray title" + root = self.h5file.root + + # Make the chunkshape smaller or equal than small_blocksizes[-1] + chunkshape = (2,) + self.table = self.h5file.create_table(root, 'table', TDescr, title, + None, self.nrows, + chunkshape=chunkshape) + self.table.autoindex = self.iprops.auto + for colname in self.colsToIndex: + self.table.colinstances[colname].create_index( + _blocksizes=self.small_blocksizes) + for i in range(self.nrows): + # Fill rows with defaults + self.table.row.append() + self.table.flush() + if self.reopen: + self._reopen(mode='a') + self.table = self.h5file.root.table + + def test01_attrs(self): + """Checking indexing attributes (part1)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_attrs..." % self.__class__.__name__) + + table = self.table + if self.iprops is DefaultProps: + self.assertEqual(table.indexed, 0) + else: + self.assertEqual(table.indexed, 1) + if self.iprops is DefaultProps: + self.assertEqual(table.colindexed["var1"], 0) + self.assertIsNone(table.cols.var1.index) + self.assertEqual(table.colindexed["var2"], 0) + self.assertIsNone(table.cols.var2.index) + self.assertEqual(table.colindexed["var3"], 0) + self.assertIsNone(table.cols.var3.index) + self.assertEqual(table.colindexed["var4"], 0) + self.assertIsNone(table.cols.var4.index) + else: + # Check that the var1, var2 and var3 (and only these) + # has been indexed + self.assertEqual(table.colindexed["var1"], 1) + self.assertIsNotNone(table.cols.var1.index) + self.assertEqual(table.colindexed["var2"], 1) + self.assertIsNotNone(table.cols.var2.index) + self.assertEqual(table.colindexed["var3"], 1) + self.assertIsNotNone(table.cols.var3.index) + self.assertEqual(table.colindexed["var4"], 0) + self.assertIsNone(table.cols.var4.index) + + def test02_attrs(self): + """Checking indexing attributes (part2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_attrs..." % self.__class__.__name__) + + table = self.table + + # Check the policy parameters + if common.verbose: + if table.indexed: + print("index props:", table.autoindex) + else: + print("Table is not indexed") + + # Check non-default values for index saving policy + if self.iprops is NoAutoProps: + self.assertFalse(table.autoindex) + elif self.iprops is ChangeFiltersProps: + self.assertTrue(table.autoindex) + + # Check Index() objects exists and are properly placed + if self.iprops is DefaultProps: + self.assertEqual(table.cols.var1.index, None) + self.assertEqual(table.cols.var2.index, None) + self.assertEqual(table.cols.var3.index, None) + self.assertEqual(table.cols.var4.index, None) + else: + self.assertIsInstance(table.cols.var1.index, tb.index.Index) + self.assertIsInstance(table.cols.var2.index, tb.index.Index) + self.assertIsInstance(table.cols.var3.index, tb.index.Index) + self.assertEqual(table.cols.var4.index, None) + + def test03_counters(self): + """Checking indexing counters""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_counters..." 
% self.__class__.__name__) + table = self.table + + # Check the counters for indexes + if common.verbose: + if table.indexed: + print("indexedrows:", table._indexedrows) + print("unsavedindexedrows:", table._unsaved_indexedrows) + index = table.cols.var1.index + print("table rows:", table.nrows) + print("computed indexed rows:", index.nrows * index.slicesize) + else: + print("Table is not indexed") + if self.iprops is not DefaultProps: + index = table.cols.var1.index + indexedrows = index.nelements + self.assertEqual(table._indexedrows, indexedrows) + indexedrows = index.nelements + self.assertEqual(table._unsaved_indexedrows, + self.nrows - indexedrows) + + def test04_noauto(self): + """Checking indexing counters (non-automatic mode)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_noauto..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Check the counters for indexes + if common.verbose: + if table.indexed: + print("indexedrows:", table._indexedrows) + print("unsavedindexedrows:", table._unsaved_indexedrows) + index = table.cols.var1.index + print("computed indexed rows:", index.nelements) + else: + print("Table is not indexed") + + # No unindexated rows should remain + index = table.cols.var1.index + if self.iprops is DefaultProps: + self.assertIsNone(index) + else: + indexedrows = index.nelements + self.assertEqual(table._indexedrows, index.nelements) + self.assertEqual(table._unsaved_indexedrows, + self.nrows - indexedrows) + + # Check non-default values for index saving policy + if self.iprops is NoAutoProps: + self.assertFalse(table.autoindex) + elif self.iprops is ChangeFiltersProps: + self.assertTrue(table.autoindex) + + def test05_icounters(self): + """Checking indexing counters (remove_rows)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_icounters..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + unsavedindexedrows = table._unsaved_indexedrows + + # Now, remove some rows: + table.remove_rows(2, 4) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check the counters for indexes + if common.verbose: + if table.indexed: + print("indexedrows:", table._indexedrows) + print("original indexedrows:", indexedrows) + print("unsavedindexedrows:", table._unsaved_indexedrows) + print("original unsavedindexedrows:", unsavedindexedrows) + # index = table.cols.var1.index + print("index dirty:", table.cols.var1.index.dirty) + else: + print("Table is not indexed") + + # Check the counters + self.assertEqual(table.nrows, self.nrows - 2) + if self.iprops is NoAutoProps: + self.assertTrue(table.cols.var1.index.dirty) + + # Check non-default values for index saving policy + if self.iprops is NoAutoProps: + self.assertFalse(table.autoindex) + elif self.iprops is ChangeFiltersProps: + self.assertTrue(table.autoindex) + + def test06_dirty(self): + """Checking dirty flags (remove_rows action)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06_dirty..." 
% self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Now, remove some rows: + table.remove_rows(3, 5) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check the dirty flag for indexes + if common.verbose: + print("auto flag:", table.autoindex) + for colname in table.colnames: + if table.cols._f_col(colname).index: + print("dirty flag col %s: %s" % + (colname, table.cols._f_col(colname).index.dirty)) + # Check the flags + for colname in table.colnames: + if table.cols._f_col(colname).index: + if not table.autoindex: + self.assertEqual(table.cols._f_col(colname).index.dirty, + True) + else: + self.assertEqual(table.cols._f_col(colname).index.dirty, + False) + + def test07_noauto(self): + """Checking indexing counters (modify_rows, no-auto mode)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_noauto..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # No unindexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + unsavedindexedrows = table._unsaved_indexedrows + + # Now, modify just one row: + table.modify_rows(3, None, 1, [("asa", 0, 3, 3.1)]) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check the counters for indexes + if common.verbose: + if table.indexed: + print("indexedrows:", table._indexedrows) + print("original indexedrows:", indexedrows) + print("unsavedindexedrows:", table._unsaved_indexedrows) + print("original unsavedindexedrows:", unsavedindexedrows) + index = table.cols.var1.index + print("computed indexed rows:", index.nelements) + else: + print("Table is not indexed") + + # Check the counters + self.assertEqual(table.nrows, self.nrows) + if self.iprops is NoAutoProps: + self.assertTrue(table.cols.var1.index.dirty) + + # Check the dirty flag for indexes + if common.verbose: + for colname in table.colnames: + if table.cols._f_col(colname).index: + print("dirty flag col %s: %s" % + (colname, table.cols._f_col(colname).index.dirty)) + for colname in table.colnames: + if table.cols._f_col(colname).index: + if not table.autoindex: + self.assertEqual(table.cols._f_col(colname).index.dirty, + True) + else: + self.assertEqual(table.cols._f_col(colname).index.dirty, + False) + + def test07b_noauto(self): + """Checking indexing queries (modify in iterator, no-auto mode)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07b_noauto..." 
% self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Do a query that uses indexes + res = [row.nrow for row in table.where('(var2 == True) & (var3 > 0)')] + + # Now, modify just one row: + for row in table: + if row.nrow == 3: + row['var1'] = "asa" + row['var2'] = True + row['var3'] = 3 + row['var4'] = 3.1 + row.update() + table.flush() + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Do a query that uses indexes + resq = [row.nrow for row in table.where('(var2 == True) & (var3 > 0)')] + res_ = res + [3] + if common.verbose: + print("AutoIndex?:", table.autoindex) + print("Query results (original):", res) + print("Query results (after modifying table):", resq) + print("Should look like:", res_) + self.assertEqual(res_, resq) + + def test07c_noauto(self): + """Checking indexing queries (append, no-auto mode)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07c_noauto..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Do a query that uses indexes + res = [row.nrow for row in table.where('(var2 == True) & (var3 > 0)')] + + # Now, append three rows + table.append([("asa", True, 1, 3.1)]) + table.append([("asb", True, 2, 3.1)]) + table.append([("asc", True, 3, 3.1)]) + table.flush() + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Do a query that uses indexes + resq = [row.nrow for row in table.where('(var2 == True) & (var3 > 0)')] + res_ = res + [table.nrows-3, table.nrows-2, table.nrows-1] + if common.verbose: + print("AutoIndex?:", table.autoindex) + print("Query results (original):", res) + print("Query results (after modifying table):", resq) + print("Should look like:", res_) + self.assertEqual(res_, resq) + + def test08_dirty(self): + """Checking dirty flags (modify_columns)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_dirty..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + self.assertIsNotNone(indexedrows) + unsavedindexedrows = table._unsaved_indexedrows + self.assertIsNotNone(unsavedindexedrows) + + # Now, modify a couple of rows: + table.modify_columns(1, columns=[["asa", "asb"], [1., 2.]], + names=["var1", "var4"]) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check the counters + self.assertEqual(table.nrows, self.nrows) + if self.iprops is NoAutoProps: + self.assertTrue(table.cols.var1.index.dirty) + + # Check the dirty flag for indexes + if common.verbose: + for colname in table.colnames: + if table.cols._f_col(colname).index: + print("dirty flag col %s: %s" % + (colname, table.cols._f_col(colname).index.dirty)) + for colname in table.colnames: + if table.cols._f_col(colname).index: + if not table.autoindex: + if colname in ["var1"]: + self.assertEqual( + table.cols._f_col(colname).index.dirty, True) + else: + self.assertEqual( + table.cols._f_col(colname).index.dirty, False) + else: + self.assertEqual(table.cols._f_col(colname).index.dirty, + False) + + def test09a_propIndex(self): + """Checking propagate Index feature in Table.copy() (attrs)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09a_propIndex..." 
% self.__class__.__name__) + table = self.table + + # Don't force a sync in indexes + # table.flush_rows_to_index() + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + self.assertIsNotNone(indexedrows) + unsavedindexedrows = table._unsaved_indexedrows + self.assertIsNotNone(unsavedindexedrows) + + # Now, remove some rows to make columns dirty + # table.remove_rows(3,5) + # Copy a Table to another location + table2 = table.copy("/", 'table2', propindexes=True) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + table2 = self.h5file.root.table2 + + index1 = table.cols.var1.index + index2 = table2.cols.var1.index + if common.verbose: + print("Copied index:", index2) + print("Original index:", index1) + if index1: + print("Elements in copied index:", index2.nelements) + print("Elements in original index:", index1.nelements) + + # Check the counters + self.assertEqual(table.nrows, table2.nrows) + if table.indexed: + self.assertTrue(table2.indexed) + if self.iprops is DefaultProps: + # No index: the index should not exist + self.assertIsNone(index1) + self.assertIsNone(index2) + elif self.iprops is NoAutoProps: + self.assertIsNotNone(index2) + + # Check the dirty flag for indexes + if common.verbose: + for colname in table2.colnames: + if table2.cols._f_col(colname).index: + print("dirty flag col %s: %s" % + (colname, table2.cols._f_col(colname).index.dirty)) + for colname in table2.colnames: + if table2.cols._f_col(colname).index: + self.assertEqual(table2.cols._f_col(colname).index.dirty, + False) + + def test09b_propIndex(self): + """Checking that propindexes=False works""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09b_propIndex..." % self.__class__.__name__) + table = self.table + + # Don't force a sync in indexes + # table.flush_rows_to_index() + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + self.assertIsNotNone(indexedrows) + unsavedindexedrows = table._unsaved_indexedrows + self.assertIsNotNone(unsavedindexedrows) + + # Now, remove some rows to make columns dirty + # table.remove_rows(3,5) + # Copy a Table to another location + table2 = table.copy("/", 'table2', propindexes=False) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + table2 = self.h5file.root.table2 + + if common.verbose: + print("autoindex?:", self.iprops.auto) + print("Copied index indexed?:", table2.cols.var1.is_indexed) + print("Original index indexed?:", table.cols.var1.is_indexed) + if self.iprops is DefaultProps: + # No index: the index should not exist + self.assertFalse(table2.cols.var1.is_indexed) + self.assertFalse(table.cols.var1.is_indexed) + elif self.iprops is NoAutoProps: + self.assertFalse(table2.cols.var1.is_indexed) + self.assertTrue(table.cols.var1.is_indexed) + + def test10_propIndex(self): + """Checking propagate Index feature in Table.copy() (values)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10_propIndex..." 
% self.__class__.__name__) + table = self.table + + # Don't force a sync in indexes + # table.flush_rows_to_index() + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + self.assertIsNotNone(indexedrows) + unsavedindexedrows = table._unsaved_indexedrows + self.assertIsNotNone(unsavedindexedrows) + + # Now, remove some rows to make columns dirty + # table.remove_rows(3,5) + # Copy a Table to another location + table2 = table.copy("/", 'table2', propindexes=True) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + table2 = self.h5file.root.table2 + + index1 = table.cols.var3.index + index2 = table2.cols.var3.index + if common.verbose: + print("Copied index:", index2) + print("Original index:", index1) + if index1: + print("Elements in copied index:", index2.nelements) + print("Elements in original index:", index1.nelements) + + def test11_propIndex(self): + """Checking propagate Index feature in Table.copy() (dirty flags)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test11_propIndex..." % self.__class__.__name__) + table = self.table + + # Force a sync in indexes + table.flush_rows_to_index() + + # Non indexated rows should remain here + if self.iprops is not DefaultProps: + indexedrows = table._indexedrows + self.assertIsNotNone(indexedrows) + unsavedindexedrows = table._unsaved_indexedrows + self.assertIsNotNone(unsavedindexedrows) + + # Now, modify an indexed column and an unindexed one + # to make the "var1" dirty + table.modify_columns(1, columns=[["asa", "asb"], [1., 2.]], + names=["var1", "var4"]) + + # Copy a Table to another location + table2 = table.copy("/", 'table2', propindexes=True) + if self.reopen: + self._reopen(mode='a') + table = self.h5file.root.table + table2 = self.h5file.root.table2 + + index1 = table.cols.var1.index + index2 = table2.cols.var1.index + if common.verbose: + print("Copied index:", index2) + print("Original index:", index1) + if index1: + print("Elements in copied index:", index2.nelements) + print("Elements in original index:", index1.nelements) + + # Check the dirty flag for indexes + if common.verbose: + for colname in table2.colnames: + if table2.cols._f_col(colname).index: + print("dirty flag col %s: %s" % + (colname, table2.cols._f_col(colname).index.dirty)) + for colname in table2.colnames: + if table2.cols._f_col(colname).index: + if table2.autoindex: + # All the destination columns should be non-dirty because + # the copy removes the dirty state and puts the + # index in a sane state + self.assertEqual(table2.cols._f_col(colname).index.dirty, + False) + + +# minRowIndex = 10000 # just if one wants more indexed rows to be checked +class AI1TestCase(AutomaticIndexingTestCase): + # nrows = 10002 + nrows = 102 + reopen = 0 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI2TestCase(AutomaticIndexingTestCase): + # nrows = 10002 + nrows = 102 + reopen = 1 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI4bTestCase(AutomaticIndexingTestCase): + # nrows = 10012 + nrows = 112 + reopen = 1 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI5TestCase(AutomaticIndexingTestCase): + sbs, bs, ss, cs = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + nrows = ss * 11-1 + reopen = 0 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI6TestCase(AutomaticIndexingTestCase): + sbs, bs, ss, cs = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) 
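# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# The AI*TestCase classes size their row counts from
# tb.idxutils.calc_chunksize(minRowIndex, memlevel=1), which apparently yields
# the (superblock, block, slice, chunk) sizes used by the index, and they run
# with autoindex disabled so that flush_rows_to_index() has pending work.
# A self-contained sketch of that manual re-indexing pattern (file, table and
# column names are our own):

import tables as tb

class Rec(tb.IsDescription):
    var1 = tb.StringCol(4)

with tb.open_file("autoindex_demo.h5", "w") as h5f:
    table = h5f.create_table("/", "t", Rec)
    table.autoindex = False            # appends no longer update the index
    table.cols.var1.create_index()
    for i in range(1000):
        table.row["var1"] = b"abcd"
        table.row.append()
    table.flush()                      # data on disk, index now stale
    table.flush_rows_to_index()        # push the pending rows into the index
# --- end of aside -----------------------------------------------------------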
+ nrows = ss * 21 + 1 + reopen = 1 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI7TestCase(AutomaticIndexingTestCase): + sbs, bs, ss, cs = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + nrows = ss * 12-1 + # nrows = ss * 1-1 # faster test + reopen = 0 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI8TestCase(AutomaticIndexingTestCase): + sbs, bs, ss, cs = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + nrows = ss * 15 + 100 + # nrows = ss * 1 + 100 # faster test + reopen = 1 + iprops = NoAutoProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI9TestCase(AutomaticIndexingTestCase): + sbs, bs, ss, cs = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + nrows = ss + reopen = 0 + iprops = DefaultProps + colsToIndex = [] + + +class AI10TestCase(AutomaticIndexingTestCase): + # nrows = 10002 + nrows = 102 + reopen = 1 + iprops = DefaultProps + colsToIndex = [] + + +class AI11TestCase(AutomaticIndexingTestCase): + # nrows = 10002 + nrows = 102 + reopen = 0 + iprops = ChangeFiltersProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class AI12TestCase(AutomaticIndexingTestCase): + # nrows = 10002 + nrows = 102 + reopen = 0 + iprops = ChangeFiltersProps + colsToIndex = ['var1', 'var2', 'var3'] + + +class ManyNodesTestCase(common.TempFileMixin, common.PyTablesTestCase): + opem_kwargs = dict(node_cache_slots=64) + + def test00(self): + """Indexing many nodes in one single session (based on bug #26)""" + + IdxRecord = { + 'f0': tb.Int8Col(), + 'f1': tb.Int8Col(), + 'f2': tb.Int8Col(), + } + + for qn in range(5): + for sn in range(5): + qchr = 'chr' + str(qn) + name = 'chr' + str(sn) + path = "/at/%s/pt" % (qchr) + table = self.h5file.create_table( + path, name, IdxRecord, createparents=1) + table.cols.f0.create_index() + table.cols.f1.create_index() + table.cols.f2.create_index() + table.row.append() + table.flush() + + +class IndexPropsChangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test case for changing index properties in a table.""" + + class MyDescription(tb.IsDescription): + icol = tb.IntCol() + oldIndexProps = IndexProps() + newIndexProps = IndexProps(auto=False, filters=tb.Filters(complevel=9)) + + def setUp(self): + super().setUp() + + table = self.h5file.create_table('/', 'test', self.MyDescription) + table.autoindex = self.oldIndexProps.auto + row = table.row + for i in range(100): + row['icol'] = i % 25 + row.append() + table.flush() + self.table = table + + def test_attributes(self): + """Storing index properties as table attributes.""" + for refprops in [self.oldIndexProps, self.newIndexProps]: + self.assertEqual(self.table.autoindex, refprops.auto) + self.table.autoindex = self.newIndexProps.auto + + def test_copyattrs(self): + """Copying index properties attributes.""" + oldtable = self.table + newtable = oldtable.copy('/', 'test2') + self.assertEqual(oldtable.autoindex, newtable.autoindex) + + +class IndexFiltersTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test case for setting index filters.""" + + def setUp(self): + super().setUp() + description = {'icol': tb.IntCol()} + self.table = self.h5file.create_table('/', 'test', description) + + def test_createIndex(self): + """Checking input parameters in new indexes.""" + # Different from default. + argfilters = copy.copy(tb.index.default_index_filters) + argfilters.shuffle = not tb.index.default_index_filters.shuffle + + # Different both from default and the previous one. 
+ idxfilters = copy.copy(tb.index.default_index_filters) + idxfilters.shuffle = not tb.index.default_index_filters.shuffle + idxfilters.fletcher32 = not tb.index.default_index_filters.fletcher32 + + icol = self.table.cols.icol + + # First create + icol.create_index(kind='ultralight', optlevel=4) + self.assertEqual(icol.index.kind, 'ultralight') + self.assertEqual(icol.index.optlevel, 4) + self.assertEqual(icol.index.filters, tb.index.default_index_filters) + icol.remove_index() + + # Second create + icol.create_index(kind='medium', optlevel=3, filters=argfilters) + self.assertEqual(icol.index.kind, 'medium') + self.assertEqual(icol.index.optlevel, 3) + self.assertEqual(icol.index.filters, argfilters) + icol.remove_index() + + def test_reindex(self): + """Checking input parameters in recomputed indexes.""" + icol = self.table.cols.icol + icol.create_index( + kind='full', optlevel=5, filters=tb.Filters(complevel=3)) + kind = icol.index.kind + optlevel = icol.index.optlevel + filters = icol.index.filters + icol.reindex() + ni = icol.index + if common.verbose: + print(f"Old parameters: {kind}, {optlevel}, {filters}") + print("New parameters: {}, {}, {}".format( + ni.kind, ni.optlevel, ni.filters)) + self.assertEqual(ni.kind, kind) + self.assertEqual(ni.optlevel, optlevel) + self.assertEqual(ni.filters, filters) + + +class OldIndexTestCase(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename("idx-std-1.x.h5") + + def test1_x(self): + """Check that files with 1.x indexes are recognized and warned.""" + + self.assertWarns(tb.exceptions.OldIndexWarning, + self.h5file.get_node, "/table") + + +# Sensible parameters for indexing with small blocksizes +small_blocksizes = (512, 128, 32, 8) + + +class CompletelySortedIndexTestCase(common.TempFileMixin, + common.PyTablesTestCase): + """Test case for testing a complete sort in a table.""" + + nrows = 100 + nrowsinbuf = 11 + + class MyDescription(tb.IsDescription): + rcol = tb.IntCol(pos=1) + icol = tb.IntCol(pos=2) + + def setUp(self): + super().setUp() + table = self.h5file.create_table('/', 'table', self.MyDescription) + row = table.row + nrows = self.nrows + for i in range(nrows): + row['rcol'] = i + row['icol'] = nrows - i + row.append() + table.flush() + self.table = table + self.icol = self.table.cols.icol + # A full index with maximum optlevel should always be completely sorted + self.icol.create_csindex(_blocksizes=small_blocksizes) + + def test00_isCompletelySortedIndex(self): + """Testing the Column.is_csi property.""" + + icol = self.icol + self.assertEqual(icol.index.is_csi, True) + icol.remove_index() + # Other kinds than full, should never return a CSI + icol.create_index(kind="medium", optlevel=9) + self.assertEqual(icol.index.is_csi, False) + icol.remove_index() + # As the table is small, lesser optlevels should be able to + # create a completely sorted index too. 
+ icol.create_index(kind="full", optlevel=6) + self.assertEqual(icol.index.is_csi, True) + # Checking a CSI in a sorted copy + self.table.copy("/", 'table2', sortby='icol', checkCSI=True) + self.assertEqual(icol.index.is_csi, True) + + def test01_readSorted1(self): + """Testing the Index.read_sorted() method with no arguments.""" + + icol = self.icol + sortedcol = np.sort(icol[:]) + sortedcol2 = icol.index.read_sorted() + if common.verbose: + print("Original sorted column:", sortedcol) + print("The values from the index:", sortedcol2) + self.assertTrue(common.allequal(sortedcol, sortedcol2)) + + def test01_readSorted2(self): + """Testing the Index.read_sorted() method with arguments (I).""" + + icol = self.icol + sortedcol = np.sort(icol[:])[30:55] + sortedcol2 = icol.index.read_sorted(30, 55) + if common.verbose: + print("Original sorted column:", sortedcol) + print("The values from the index:", sortedcol2) + self.assertTrue(common.allequal(sortedcol, sortedcol2)) + + def test01_readSorted3(self): + """Testing the Index.read_sorted() method with arguments (II).""" + + icol = self.icol + sortedcol = np.sort(icol[:])[33:97] + sortedcol2 = icol.index.read_sorted(33, 97) + if common.verbose: + print("Original sorted column:", sortedcol) + print("The values from the index:", sortedcol2) + self.assertTrue(common.allequal(sortedcol, sortedcol2)) + + def test02_readIndices1(self): + """Testing the Index.read_indices() method with no arguments.""" + + icol = self.icol + indicescol = np.argsort(icol[:]).astype('uint64') + indicescol2 = icol.index.read_indices() + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test02_readIndices2(self): + """Testing the Index.read_indices() method with arguments (I).""" + + icol = self.icol + indicescol = np.argsort(icol[:])[30:55].astype('uint64') + indicescol2 = icol.index.read_indices(30, 55) + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test02_readIndices3(self): + """Testing the Index.read_indices() method with arguments (II).""" + + icol = self.icol + indicescol = np.argsort(icol[:])[33:97].astype('uint64') + indicescol2 = icol.index.read_indices(33, 97) + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test02_readIndices4(self): + """Testing the Index.read_indices() method with arguments (III).""" + + icol = self.icol + indicescol = np.argsort(icol[:])[33:97:2].astype('uint64') + indicescol2 = icol.index.read_indices(33, 97, 2) + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test02_readIndices5(self): + """Testing the Index.read_indices() method with arguments (IV).""" + + icol = self.icol + indicescol = np.argsort(icol[:])[33:55:5].astype('uint64') + indicescol2 = icol.index.read_indices(33, 55, 5) + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test02_readIndices6(self): + """Testing the Index.read_indices() method with step only.""" + + icol = self.icol + indicescol = 
np.argsort(icol[:])[::3].astype('uint64') + indicescol2 = icol.index.read_indices(step=3) + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test03_getitem1(self): + """Testing the Index.__getitem__() method with no arguments.""" + + icol = self.icol + indicescol = np.argsort(icol[:]).astype('uint64') + indicescol2 = icol.index[:] + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test03_getitem2(self): + """Testing the Index.__getitem__() method with start.""" + + icol = self.icol + indicescol = np.argsort(icol[:])[31].astype('uint64') + indicescol2 = icol.index[31] + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test03_getitem3(self): + """Testing the Index.__getitem__() method with start, stop.""" + + icol = self.icol + indicescol = np.argsort(icol[:])[2:16].astype('uint64') + indicescol2 = icol.index[2:16] + if common.verbose: + print("Original indices column:", indicescol) + print("The values from the index:", indicescol2) + self.assertTrue(common.allequal(indicescol, indicescol2)) + + def test04_itersorted1(self): + """Testing the Table.itersorted() method with no arguments.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol')], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted2(self): + """Testing the Table.itersorted() method with a start.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[15:] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', start=15)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted3(self): + """Testing the Table.itersorted() method with a stop.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[:20] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', stop=20)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted4(self): + """Testing the Table.itersorted() method with a start and stop.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[15:20] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', start=15, stop=20)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted5(self): + """Testing the Table.itersorted() method with a start, stop and + step.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[15:45:4] + sortedtable2 
= np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', start=15, stop=45, step=4)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted6(self): + """Testing the Table.itersorted() method with a start, stop and + step.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[33:55:5] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', start=33, stop=55, step=5)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted7(self): + """Testing the Table.itersorted() method with checkCSI=True.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', checkCSI=True)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted8(self): + """Testing the Table.itersorted() method with a start, stop and + negative step.""" + + # see also gh-252 + table = self.table + sortedtable = np.sort(table[:], order='icol')[55:33:-5] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', start=55, stop=33, step=-5)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted9(self): + """Testing the Table.itersorted() method with a negative step -5.""" + + # see also gh-252 + table = self.table + sortedtable = np.sort(table[:], order='icol')[::-5] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', step=-5)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test04_itersorted10(self): + """Testing the Table.itersorted() method with a negative step -1.""" + + # see also gh-252 + table = self.table + sortedtable = np.sort(table[:], order='icol')[::-1] + sortedtable2 = np.array( + [row.fetch_all_fields() for row in table.itersorted( + 'icol', step=-1)], dtype=table._v_dtype) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from the iterator:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted1(self): + """Testing the Table.read_sorted() method with no arguments.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table.read_sorted('icol') + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted2(self): + """Testing the Table.read_sorted() method with a start.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[16:17] + sortedtable2 = table.read_sorted('icol', start=16) + if common.verbose: + 
print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted3(self): + """Testing the Table.read_sorted() method with a start and stop.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[16:33] + sortedtable2 = table.read_sorted('icol', start=16, stop=33) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted4(self): + """Testing the Table.read_sorted() method with a start, stop and + step.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[33:55:5] + sortedtable2 = table.read_sorted('icol', start=33, stop=55, step=5) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted5(self): + """Testing the Table.read_sorted() method with only a step.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[::3] + sortedtable2 = table.read_sorted('icol', step=3) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted6(self): + """Testing the Table.read_sorted() method with negative step.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[::-1] + sortedtable2 = table.read_sorted('icol', step=-1) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted7(self): + """Testing the Table.read_sorted() method with negative step (II).""" + + table = self.table + sortedtable = np.sort(table[:], order='icol')[::-2] + sortedtable2 = table.read_sorted('icol', step=-2) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted8(self): + """Testing the Table.read_sorted() method with negative step (III)).""" + + table = self.table + sstart = 100-24-1 + sstop = 100-54-1 + sortedtable = np.sort(table[:], order='icol')[sstart:sstop:-1] + sortedtable2 = table.read_sorted('icol', start=24, stop=54, step=-1) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted9(self): + """Testing the Table.read_sorted() method with negative step (IV)).""" + + table = self.table + sstart = 100-14-1 + sstop = 100-54-1 + sortedtable = np.sort(table[:], order='icol')[sstart:sstop:-3] + sortedtable2 = table.read_sorted('icol', start=14, stop=54, step=-3) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted10(self): + """Testing the Table.read_sorted() method with negative step (V)).""" + + table = self.table + sstart = 100-24-1 + sstop = 100-25-1 + sortedtable = np.sort(table[:], order='icol')[sstart:sstop:-2] + sortedtable2 = table.read_sorted('icol', start=24, stop=25, 
step=-2) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05_readSorted11(self): + """Testing the Table.read_sorted() method with start > stop.""" + + table = self.table + sstart = 100-137-1 + sstop = 100-25-1 + sortedtable = np.sort(table[:], order='icol')[sstart:sstop:-2] + sortedtable2 = table.read_sorted('icol', start=137, stop=25, step=-2) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05a_readSorted12(self): + """Testing the Table.read_sorted() method with checkCSI (I).""" + + table = self.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table.read_sorted('icol', checkCSI=True) + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test05b_readSorted12(self): + """Testing the Table.read_sorted() method with checkCSI (II).""" + + table = self.table + self.assertRaises(ValueError, + table.read_sorted, "rcol", checkCSI=False) + + def test06_copy_sorted1(self): + """Testing the Table.copy(sortby) method with no arguments.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted2(self): + """Testing the Table.copy(sortby) method with step=-1.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol", step=-1) + sortedtable = np.sort(table[:], order='icol')[::-1] + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted3(self): + """Testing the Table.copy(sortby) method with only a start.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol", start=3) + sortedtable = np.sort(table[:], order='icol')[3:4] + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted4(self): + """Testing the Table.copy(sortby) method with start, stop.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol", start=3, stop=40) + sortedtable = np.sort(table[:], order='icol')[3:40] + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted5(self): + """Testing the Table.copy(sortby) method with start, stop, step.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol", + 
start=3, stop=33, step=5) + sortedtable = np.sort(table[:], order='icol')[3:33:5] + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted6(self): + """Testing the Table.copy(sortby) method after table re-opening.""" + + self._reopen(mode='a') + table = self.h5file.root.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table2[:] + if common.verbose: + print("Original sorted table:", sortedtable) + print("The values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test06_copy_sorted7(self): + """Testing the `checkCSI` parameter of Table.copy() (I).""" + + table = self.table + + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + self.assertRaises(ValueError, + table2.copy, "/", 'table3', + sortby="rcol", checkCSI=False) + + def test06_copy_sorted8(self): + """Testing the `checkCSI` parameter of Table.copy() (II).""" + + table = self.table + + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + self.assertRaises(ValueError, + table2.copy, "/", 'table3', + sortby="rcol", checkCSI=True) + + def test07_isCSI_noelements(self): + """Testing the representation of an index with no elements.""" + + t2 = self.h5file.create_table('/', 't2', self.MyDescription) + irows = t2.cols.rcol.create_csindex() + if common.verbose: + print("repr(t2)-->\n", repr(t2)) + self.assertEqual(irows, 0) + self.assertEqual(t2.colindexes['rcol'].is_csi, False) + + +class ReadSortedIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test case for testing sorted reading in a "full" sorted column.""" + + nrows = 100 + nrowsinbuf = 11 + + class MyDescription(tb.IsDescription): + rcol = tb.IntCol(pos=1) + icol = tb.IntCol(pos=2) + + def setUp(self): + super().setUp() + + table = self.h5file.create_table('/', 'table', self.MyDescription) + row = table.row + nrows = self.nrows + for i in range(nrows): + row['rcol'] = i + row['icol'] = nrows - i + row.append() + table.flush() + self.table = table + self.icol = self.table.cols.icol + # A full index with maximum optlevel should always be completely sorted + self.icol.create_index(optlevel=self.optlevel, kind="full", + _blocksizes=small_blocksizes) + + def test01_readSorted1(self): + """Testing the Table.read_sorted() method with no arguments.""" + + table = self.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table.read_sorted('icol') + if common.verbose: + print("Sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + # Compare with the sorted read table because we have no + # guarantees that read_sorted returns a completely sorted table + self.assertTrue(common.allequal( + sortedtable, np.sort(sortedtable2, order="icol"))) + + def test01_readSorted2(self): + """Testing the Table.read_sorted() method with no arguments + (re-open).""" + + self._reopen() + table = self.h5file.root.table + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = table.read_sorted('icol') + if common.verbose: + print("Sorted table:", sortedtable) + print("The values from read_sorted:", sortedtable2) + # Compare with the sorted read table because we have no 
+ # guarantees that read_sorted returns a completely sorted table + self.assertTrue(common.allequal( + sortedtable, np.sort(sortedtable2, order="icol"))) + + def test02_copy_sorted1(self): + """Testing the Table.copy(sortby) method.""" + + table = self.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = np.sort(table2[:], order='icol') + if common.verbose: + print("Original table:", table2[:]) + print("The sorted values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + def test02_copy_sorted2(self): + """Testing the Table.copy(sortby) method after table re-opening.""" + + self._reopen(mode='a') + table = self.h5file.root.table + # Copy to another table + table.nrowsinbuf = self.nrowsinbuf + table2 = table.copy("/", 'table2', sortby="icol") + sortedtable = np.sort(table[:], order='icol') + sortedtable2 = np.sort(table2[:], order='icol') + if common.verbose: + print("Original table:", table2[:]) + print("The sorted values from copy:", sortedtable2) + self.assertTrue(common.allequal(sortedtable, sortedtable2)) + + +class ReadSortedIndex0(ReadSortedIndexTestCase): + optlevel = 0 + + +class ReadSortedIndex3(ReadSortedIndexTestCase): + optlevel = 3 + + +class ReadSortedIndex6(ReadSortedIndexTestCase): + optlevel = 6 + + +class ReadSortedIndex9(ReadSortedIndexTestCase): + optlevel = 9 + + +class Issue156TestBase(common.TempFileMixin, common.PyTablesTestCase): + # field name in table according to which test_copysort() sorts the table + sort_field = None + + def setUp(self): + super().setUp() + + # create nested table + class Foo(tb.IsDescription): + frame = tb.UInt16Col() + + class Bar(tb.IsDescription): + code = tb.UInt16Col() + + table = self.h5file.create_table('/', 'foo', Foo, + filters=tb.Filters(3, 'zlib'), + createparents=True) + + self.h5file.flush() + + # fill table with 10 random numbers + for k in range(10): + row = table.row + row['frame'] = np.random.randint(0, 2**16-1) + row['Bar/code'] = np.random.randint(0, 2**16-1) + row.append() + + self.h5file.flush() + + def test_copysort(self): + # copy table + oldNode = self.h5file.get_node('/foo') + + # create completely sorted index on a main column + oldNode.colinstances[self.sort_field].create_csindex() + + # this fails on ade2ba123efd267fd31 + # see gh-156 + new_node = oldNode.copy(newname='foo2', overwrite=True, + sortby=self.sort_field, checkCSI=True, + propindexes=True) + + # check column is sorted + self.assertTrue(np.all( + new_node.col(self.sort_field) == + sorted(oldNode.col(self.sort_field)))) + # check index is available + self.assertIn(self.sort_field, new_node.colindexes) + # check CSI was propagated + self.assertTrue(new_node.colindexes[self.sort_field].is_csi) + + +class Issue156TestCase01(Issue156TestBase): + # sort by field from non nested entry + sort_field = 'frame' + + +class Issue156TestCase02(Issue156TestBase): + # sort by field from nested entry + sort_field = 'Bar/code' + + +class Issue119Time32ColTestCase(common.TempFileMixin, common.PyTablesTestCase): + """TimeCol not properly indexing.""" + + col_typ = tb.Time32Col + values = [ + 0.93240451618785880, + 0.76322375510776170, + 0.16695030056300875, + 0.91259117097807850, + 0.93977847053454630, + 0.51450406513503090, + 0.24452129962257563, + 0.85475938924825230, + 0.32512326762476930, + 0.75127635627046820, + ] + + def setUp(self): + super().setUp() + + class Descr(tb.IsDescription): + when = 
self.col_typ(pos=1) + value = tb.Float32Col(pos=2) + + self.table = self.h5file.create_table('/', 'test', Descr) + + self.t = 1321031471.0 # 11/11/11 11:11:11 + data = [(self.t + i, item) for i, item in enumerate(self.values)] + self.table.append(data) + self.h5file.flush() + + def test_timecol_issue(self): + tbl = self.table + t = self.t + + wherestr = '(when >= %d) & (when < %d)' % (t, t + 5) + + no_index = tbl.read_where(wherestr) + + tbl.cols.when.create_index(_verbose=False) + with_index = tbl.read_where(wherestr) + + self.assertTrue((no_index == with_index).all()) + + +class Issue119Time64ColTestCase(Issue119Time32ColTestCase): + col_typ = tb.Time64Col + + +class TestIndexingNans(common.TempFileMixin, common.PyTablesTestCase): + def test_issue_282(self): + trMap = {'index': tb.Int64Col(), 'values': tb.FloatCol()} + table = self.h5file.create_table('/', 'table', trMap) + + r = table.row + for i in range(5): + r['index'] = i + r['values'] = np.nan if i == 0 else i + r.append() + table.flush() + + table.cols.values.create_index() + + # retrieve + result = table.read_where('(values >= 0)') + self.assertEqual(len(result), 4) + + def test_issue_327(self): + table = self.h5file.create_table('/', 'table', dict( + index=tb.Int64Col(), + values=tb.FloatCol(shape=()), + values2=tb.FloatCol(shape=()), + )) + + r = table.row + for i in range(5): + r['index'] = i + r['values'] = np.nan if i == 2 or i == 3 else i + r['values2'] = i + r.append() + table.flush() + + table.cols.values.create_index() + table.cols.values2.create_index() + + results2 = table.read_where('(values2 > 0)') + self.assertEqual(len(results2), 4) + + results = table.read_where('(values > 0)') + self.assertEqual(len(results), 2) + + def test_issue_327_b(self): + table = self.h5file.create_table('/', 'table', dict( + index=tb.Int64Col(), + values=tb.FloatCol(shape=()), + values2=tb.FloatCol(shape=()), + )) + + r = table.row + for _ in range(100): + for i in range(5): + r['index'] = i + r['values'] = np.nan if i == 2 or i == 3 else i + r['values2'] = i + r.append() + table.flush() + + table.cols.values.create_index(_blocksizes=small_blocksizes) + table.cols.values2.create_index(_blocksizes=small_blocksizes) + + results2 = table.read_where('(values2 > 0)') + self.assertEqual(len(results2), 400) + + results = table.read_where('(values > 0)') + self.assertEqual(len(results), 200) + + def test_csindex_nans(self): + table = self.h5file.create_table('/', 'table', dict( + index=tb.Int64Col(), + values=tb.FloatCol(shape=()), + values2=tb.FloatCol(shape=()), + )) + + r = table.row + for x in range(100): + for i in range(5): + r['index'] = i + r['values'] = np.nan if i == 2 or i == 3 else i + r['values2'] = i + r.append() + table.flush() + + table.cols.values.create_csindex(_blocksizes=small_blocksizes) + table.cols.values2.create_csindex(_blocksizes=small_blocksizes) + + results2 = table.read_where('(values2 > 0)') + self.assertEqual(len(results2), 100*4) + + results = table.read_where('(values > 0)') + self.assertEqual(len(results), 100*2) + + +def suite(): + theSuite = common.unittest.TestSuite() + + niter = 1 + # heavy = 1 # Uncomment this only for testing purposes! 
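    # Note: the heavier AI* variants (AI4b-AI12) are only scheduled when
    # common.heavy is true; see the `if common.heavy:` branch at the end of
    # the loop below.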
+ + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(ShuffleReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(Fletcher32ReadTestCase)) + theSuite.addTest( + common.unittest.makeSuite(ShuffleFletcher32ReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(OneHalfTestCase)) + theSuite.addTest(common.unittest.makeSuite(UpperBoundTestCase)) + theSuite.addTest(common.unittest.makeSuite(LowerBoundTestCase)) + theSuite.addTest(common.unittest.makeSuite(AI1TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI2TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI9TestCase)) + theSuite.addTest(common.unittest.makeSuite(DeepTableIndexTestCase)) + theSuite.addTest(common.unittest.makeSuite(IndexPropsChangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(IndexFiltersTestCase)) + theSuite.addTest(common.unittest.makeSuite(OldIndexTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompletelySortedIndexTestCase)) + theSuite.addTest(common.unittest.makeSuite(ManyNodesTestCase)) + theSuite.addTest(common.unittest.makeSuite(ReadSortedIndex0)) + theSuite.addTest(common.unittest.makeSuite(ReadSortedIndex3)) + theSuite.addTest(common.unittest.makeSuite(ReadSortedIndex6)) + theSuite.addTest(common.unittest.makeSuite(ReadSortedIndex9)) + theSuite.addTest(common.unittest.makeSuite(Issue156TestCase01)) + theSuite.addTest(common.unittest.makeSuite(Issue156TestCase02)) + theSuite.addTest(common.unittest.makeSuite(Issue119Time32ColTestCase)) + theSuite.addTest(common.unittest.makeSuite(Issue119Time64ColTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestIndexingNans)) + if common.heavy: + # These are too heavy for normal testing + theSuite.addTest(common.unittest.makeSuite(AI4bTestCase)) + theSuite.addTest(common.unittest.makeSuite(AI5TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI6TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI7TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI8TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI10TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI11TestCase)) + theSuite.addTest(common.unittest.makeSuite(AI12TestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_indexvalues.py b/tables/tests/test_indexvalues.py new file mode 100644 index 0000000..47503d8 --- /dev/null +++ b/tables/tests/test_indexvalues.py @@ -0,0 +1,3406 @@ +import random +import tempfile +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +# An alias for frozenset +fzset = frozenset + +# To make the tests values reproductibles +random.seed(19) + +# Sensible parameters for indexing with small blocksizes +small_blocksizes = (16, 8, 4, 2) # The smaller set of parameters... 
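# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original test module).  The test cases
# below all follow the same pattern: fill a table, build an index on a column
# with create_index(..., _blocksizes=small_blocksizes), and then compare an
# indexed where() selection against a plain Python filter over the same data.
# The helper name and the minimal column description used here are invented
# for illustration only; the helper is never called by the tests.
def _example_indexed_query(h5fname):
    """Sketch of the fill/index/query pattern exercised by these tests."""
    description = {'var3': tb.IntCol()}
    with tb.open_file(h5fname, mode='w') as h5file:
        table = h5file.create_table('/', 'example', description)
        row = table.row
        for k in range(100):
            row['var3'] = k
            row.append()
        table.flush()
        # Small blocksizes exercise several index slices even on a tiny table.
        table.cols.var3.create_index(kind='medium',
                                     _blocksizes=small_blocksizes)
        # The indexed query should return the same rows as the Python filter
        # [k for k in range(100) if k < 10], although not necessarily sorted.
        return [r['var3'] for r in table.where('var3 < 10')]
# ---------------------------------------------------------------------------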
+# The size for medium indexes +minRowIndex = 1000 + + +class Small(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"") + var2 = tb.BoolCol(dflt=0) + var3 = tb.IntCol(dflt=0) + var4 = tb.FloatCol(dflt=0) + + +class SelectValuesTestCase(common.TempFileMixin, common.PyTablesTestCase): + compress = 1 + complib = "zlib" + shuffle = 1 + fletcher32 = 0 + chunkshape = 10 + buffersize = 0 + random = 0 + values = None + reopen = False + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + if common.verbose: + print("Checking index kind-->", self.kind) + self.rootgroup = self.h5file.root + self.populateFile() + + def populateFile(self): + # Set a seed for the random generator if needed. + # This is useful when one need reproductible results. + if self.random and hasattr(self, "seed"): + random.seed(self.seed) + group = self.rootgroup + # Create an table + title = "This is the IndexArray title" + filters = tb.Filters(complevel=self.compress, + complib=self.complib, + shuffle=self.shuffle, + fletcher32=self.fletcher32) + table1 = self.h5file.create_table(group, 'table1', Small, title, + filters, self.nrows, + chunkshape=(self.chunkshape,)) + table2 = self.h5file.create_table(group, 'table2', Small, title, + filters, self.nrows, + chunkshape=(self.chunkshape,)) + count = 0 + for i in range(0, self.nrows, self.nrep): + for j in range(self.nrep): + if self.random: + k = random.randrange(self.nrows) + elif self.values is not None: + lenvalues = len(self.values) + if i >= lenvalues: + i %= lenvalues + k = self.values[i] + else: + k = i + bk = str(k).encode('ascii') + table1.row['var1'] = bk + table2.row['var1'] = bk + table1.row['var2'] = k % 2 + table2.row['var2'] = k % 2 + table1.row['var3'] = k + table2.row['var3'] = k + table1.row['var4'] = float(self.nrows - k - 1) + table2.row['var4'] = float(self.nrows - k - 1) + table1.row.append() + table2.row.append() + count += 1 + table1.flush() + table2.flush() + if self.buffersize: + # Change the buffersize by default + table1.nrowsinbuf = self.buffersize + # Make sure nrowsinbuf is a multiple of chunkshape + table1.nrowsinbuf -= table1.nrowsinbuf % self.chunkshape + # Index all entries: + for col in table1.colinstances.values(): + indexrows = col.create_index( + kind=self.kind, _blocksizes=self.blocksizes) + if common.verbose: + print("Number of written rows:", table1.nrows) + print("Number of indexed rows:", indexrows) + + if self.reopen: + self._reopen(mode='a') # flavor changes + self.table1 = self.h5file.root.table1 + self.table2 = self.h5file.root.table1 + + def test01a(self): + """Checking selecting values from an Index (string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + # First selection + t1var1 = table1.cols.var1 + results1 = [p["var1"] for p in + table1.where('(il<=t1var1)&(t1var1<=sl)')] + results2 = [p["var1"] for p in table2 + if il <= p["var1"] <= sl] + results1.sort() + results2.sort() + if common.verbose: + print("Should look like:", results2) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + t1var1 = table1.cols.var1 + results1 = [p["var1"] for p in + table1.where('(il<=t1var1)&(t1var1 sl')] + results2 = [p["var1"] for p in table2 + if p["var1"] > sl] + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + t1var1 = table1.cols.var1 + self.assertIsNotNone(t1var1) + results1 = [p["var1"] for p in table1.where('t1var1 >= sl')] + results2 = [p["var1"] for p in table2 + if p["var1"] >= sl] + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test02a(self): + """Checking selecting values from an Index (bool flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Do some selections and check the results + t1var2 = table1.cols.var2 + self.assertIsNotNone(t1var2) + results1 = [p["var2"] for p in table1.where('t1var2 == True')] + results2 = [p["var2"] for p in table2 if p["var2"] is True] + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test02b(self): + """Checking selecting values from an Index (bool flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Do some selections and check the results + t1var2 = table1.cols.var2 + self.assertIsNotNone(t1var2) + results1 = [p["var2"] for p in table1.where('t1var2 == False')] + results2 = [p["var2"] for p in table2 if p["var2"] is False] + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test03a(self): + """Checking selecting values from an Index (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var3 + self.assertIsNotNone(t1col) + + # First selection + results1 = [p["var3"] for p in table1.where('(il<=t1col)&(t1col<=sl)')] + results2 = [p["var3"] for p in table2 + if il <= p["var3"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + results1 = [p["var3"] for p in table1.where('(il<=t1col)&(t1col sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] > sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + results1 = [p["var3"] for p in table1.where('t1col >= sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test03c(self): + """Checking selecting values from an Index (long flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03c..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + # il = long(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var3 + self.assertIsNotNone(t1col) + + # First selection + results1 = [p["var3"] for p in table1.where('t1col < sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] < sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + results1 = [p["var3"] for p in table1.where('t1col <= sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + results1 = [p["var3"] for p in table1.where('t1col > sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] > sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + results1 = [p["var3"] for p in table1.where('t1col >= sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test03d(self): + """Checking selecting values from an Index (long and int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03d..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + # il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var3 + self.assertIsNotNone(t1col) + + # First selection + results1 = [p["var3"] for p in table1.where('t1col < sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] < sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + results1 = [p["var3"] for p in table1.where('t1col <= sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + results1 = [p["var3"] for p in table1.where('t1col > sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] > sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + results1 = [p["var3"] for p in table1.where('t1col >= sl')] + results2 = [p["var3"] for p in table2 + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test04a(self): + """Checking selecting values from an Index (float flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = float(self.il) + sl = float(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var4 + self.assertIsNotNone(t1col) + + # First selection + results1 = [p["var4"] for p in table1.where('(il<=t1col)&(t1col<=sl)')] + results2 = [p["var4"] for p in table2 + if il <= p["var4"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1.sort(), results2.sort()) + + # Second selection + results1 = [p["var4"] for p in table1.where('(il<=t1col)&(t1col sl')] + results2 = [p["var4"] for p in table2 + if p["var4"] > sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + results1 = [p["var4"] for p in table1.where('t1col >= sl')] + results2 = [p["var4"] for p in table2 + if p["var4"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test05a(self): + """Checking get_where_list & itersequence (string, python flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + # First selection + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + table1.flavor = "python" + rowList1 = table1.get_where_list(condition) + results1 = [p['var1'] for p in table1.itersequence(rowList1)] + results2 = [p["var1"] for p in table2 + if il <= p["var1"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1.sort(), results2.sort()) + + # Second selection + condition = '(il<=t1col)&(t1col sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + rowList1 = table1.get_where_list(condition) + results1 = [p['var1'] for p in table1.itersequence(rowList1)] + results2 = [p["var1"] for p in table2 if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test06a(self): + """Checking get_where_list & itersequence (bool flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Do some selections and check the results + t1var2 = table1.cols.var2 + condition = 't1var2==True' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1var2.pathname])) + table1.flavor = "python" + rowList1 = table1.get_where_list(condition) + results1 = [p['var2'] for p in table1.itersequence(rowList1)] + results2 = [p["var2"] for p in table2 if p["var2"] is True] + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test06b(self): + """Checking get_where_list & itersequence (numpy bool limits & + flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06b..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Do some selections and check the results + t1var2 = table1.cols.var2 + false = np.bool_(False) + self.assertFalse(false) # silence pyflakes + condition = 't1var2==false' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1var2.pathname])) + table1.flavor = "python" + rowList1 = table1.get_where_list(condition) + results1 = [p['var2'] for p in table1.itersequence(rowList1)] + results2 = [p["var2"] for p in table2 if p["var2"] is False] + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test07a(self): + """Checking get_where_list & itersequence (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var3 + # First selection + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + table1.flavor = "python" + rowList1 = table1.get_where_list(condition) + results1 = [p['var3'] for p in table1.itersequence(rowList1)] + results2 = [p["var3"] for p in table2 + if il <= p["var3"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1.sort(), results2.sort()) + + # Second selection + condition = '(il<=t1col)&(t1col sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + rowList1 = table1.get_where_list(condition) + results1 = [p['var3'] for p in table1.itersequence(rowList1)] + results2 = [p["var3"] for p in table2 + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test08a(self): + """Checking get_where_list & itersequence (float flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = float(self.il) + sl = float(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var4 + # First selection + condition = '(il<=t1col)&(t1col<=sl)' + # results1 = [p["var4"] for p in table1.where(condition)] + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + table1.flavor = "python" + rowList1 = table1.get_where_list(condition) + results1 = [p['var4'] for p in table1.itersequence(rowList1)] + results2 = [p["var4"] for p in table2 + if il <= p["var4"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1.sort(), results2.sort()) + + # Second selection + condition = '(il<=t1col)&(t1col sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + rowList1 = table1.get_where_list(condition) + results1 = [p['var4'] for p in table1.itersequence(rowList1)] + results2 = [p["var4"] for p in table2 if p["var4"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test09a(self): + """Checking non-indexed where() (string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + table1._disable_indexing_in_queries() + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + self.assertIsNotNone(t1col) + + # First selection + condition = 't1col<=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in table1.where( + condition, start=2, stop=10)] + results2 = [p["var1"] for p in table2.iterrows(2, 10) + if p["var1"] <= sl] + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = '(il p["var1"] > sl) + ] + if common.verbose: + print("Limits:", il, sl) + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # This selection to be commented out +# condition = 't1col>=sl' +# self.assertTrue(not table1.will_query_use_indexing(condition)) +# results1 = [p['var1'] for p in table1.where(condition,start=2, +# stop=-1,step=1)] +# results2 = [p["var1"] for p in table2.iterrows(2, -1, 1) +# if p["var1"] >= sl] +# if verbose: +# print "Limit:", sl +# print "Selection results (in-kernel):", results1 +# print "Should look like:", results2 +# print "Length results:", len(results1) +# print "Should be:", len(results2) +# self.assertEqual(len(results1), len(results2)) +# self.assertEqual(results1, results2) + + # Fourth selection + # results1 = [p['var1'] for p in + # table1.where(condition,start=2,stop=-1,step=3)] + condition = 't1col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in + table1.where(condition, start=2, stop=-1, step=3)] + results2 = [p["var1"] for p in table2.iterrows(2, -1, 3) + if p["var1"] >= sl] + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Re-enable the indexing in queries basically to unnail the + # condition cache and not raising the performance warning + # about some indexes being dirty + table1._enable_indexing_in_queries() + + def test09b(self): + """Checking non-indexed where() (float flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09b..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + table1._disable_indexing_in_queries() + + # Convert the limits to the appropriate type + il = float(self.il) + sl = float(self.sl) + + # Do some selections and check the results + t1col = table1.cols.var4 + self.assertIsNotNone(t1col) + + # First selection + condition = 't1col= sl] + if common.verbose: + print("Limit:", sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Re-enable the indexing in queries basically to unnail the + # condition cache and not raising the performance warning + # about some indexes being dirty + table1._enable_indexing_in_queries() + + def test09c(self): + """Check non-indexed where() w/ ranges, changing step + (string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09c..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + table1._disable_indexing_in_queries() + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + self.assertIsNotNone(t1col) + + # First selection + condition = 't1col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in + table1.where(condition, start=2, stop=-1, step=3)] + results2 = [p["var1"] for p in table2.iterrows(2, -1, 3) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = 't1col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in + table1.where(condition, start=5, stop=-1, step=10)] + results2 = [p["var1"] for p in table2.iterrows(5, -1, 10) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = 't1col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in + table1.where(condition, start=5, stop=-3, step=11)] + results2 = [p["var1"] for p in table2.iterrows(5, -3, 11) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't1col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var1'] for p in + table1.where(condition, start=2, stop=-1, step=300)] + results2 = [p["var1"] for p in table2.iterrows(2, -1, 300) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + 
results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Re-enable the indexing in queries basically to unnail the + # condition cache and not raising the performance warning + # about some indexes being dirty + table1._enable_indexing_in_queries() + + def test09d(self): + """Checking non-indexed where() w/ ranges, changing step + (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09d..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + table1._disable_indexing_in_queries() + + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t3col = table1.cols.var3 + self.assertIsNotNone(t3col) + + # First selection + condition = 't3col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var3'] for p in + table1.where(condition, start=2, stop=-1, step=3)] + results2 = [p["var3"] for p in table2.iterrows(2, -1, 3) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = 't3col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var3'] for p in + table1.where(condition, start=5, stop=-1, step=10)] + results2 = [p["var3"] for p in table2.iterrows(5, -1, 10) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = 't3col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var3'] for p in + table1.where(condition, start=5, stop=-3, step=11)] + results2 = [p["var3"] for p in table2.iterrows(5, -3, 11) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't3col>=sl' + self.assertTrue(not table1.will_query_use_indexing(condition)) + results1 = [p['var3'] for p in + table1.where(condition, start=2, stop=-1, step=300)] + results2 = [p["var3"] for p in table2.iterrows(2, -1, 300) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Re-enable the indexing in queries basically to unnail the + # condition cache and not raising the performance warning + # about 
some indexes being dirty + table1._enable_indexing_in_queries() + + def test10a(self): + """Checking indexed where() with ranges (string flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + # First selection + condition = 't1col<=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=10) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 10) if p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat second selection (testing caches) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=2) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 2) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Selection results (indexed):", results1) + print("Should look like:", results2) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = '(il= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test10b(self): + """Checking indexed where() with ranges (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10b..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t3col = table1.cols.var3 + # First selection + condition = 't3col<=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [ + p['var3'] for p in table1.where(condition, start=2, stop=10) + ] + results2 = [ + p["var3"] for p in table2.iterrows(2, 10) + if p["var3"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = '(il<=t3col)&(t3col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [ + p['var3'] for p in table1.where(condition, start=2, stop=30, + step=2) + ] + results2 = [ + p["var3"] for p in table2.iterrows(2, 30, 2) + if il <= p["var3"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = '(il= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test10c(self): + """Checking indexed where() with ranges, changing step (string + flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10c..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + + # First selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=2, stop=-1, step=3)] + results2 = [p["var1"] for p in table2.iterrows(2, -1, 3) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=5, stop=-1, step=10)] + results2 = [p["var1"] for p in table2.iterrows(5, -1, 10) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=5, stop=-3, step=11)] + results2 = [p["var1"] for p in table2.iterrows(5, -3, 11) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't1col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=2, stop=-1, step=300)] + results2 = [p["var1"] for p in table2.iterrows(2, -1, 300) + if p["var1"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test10d(self): + """Checking indexed where() with ranges, changing step (int flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10d..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + # Do some selections and check the results + t3col = table1.cols.var3 + + # First selection + condition = 't3col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [p['var3'] for p in + table1.where(condition, start=2, stop=-1, step=3)] + results2 = [p["var3"] for p in table2.iterrows(2, -1, 3) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection + condition = 't3col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [p['var3'] for p in + table1.where(condition, start=5, stop=-1, step=10)] + results2 = [p["var3"] for p in table2.iterrows(5, -1, 10) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection + condition = 't3col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [p['var3'] for p in + table1.where(condition, start=5, stop=-3, step=11)] + results2 = [p["var3"] for p in table2.iterrows(5, -3, 11) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection + condition = 't3col>=sl' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t3col.pathname])) + results1 = [p['var3'] for p in + table1.where(condition, start=2, stop=-1, step=300)] + results2 = [p["var3"] for p in table2.iterrows(2, -1, 300) + if p["var3"] >= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test11a(self): + """Checking selecting values from an Index via read_coordinates()""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test11a..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do a selection and check the result + t1var1 = table1.cols.var1 + condition = '(il<=t1var1)&(t1var1<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1var1.pathname]) + ) + coords1 = table1.get_where_list(condition) + table1.flavor = "python" + results1 = table1.read_coordinates(coords1, field="var1") + results2 = [p["var1"] for p in table2 + if il <= p["var1"] <= sl] + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test12a(self): + """Checking selecting values after a Table.append() operation.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test12a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Append more rows in already created indexes + count = 0 + for i in range(0, self.nrows//2, self.nrep): + for j in range(self.nrep): + if self.random: + k = random.randrange(self.nrows) + elif self.values is not None: + lenvalues = len(self.values) + if i >= lenvalues: + i %= lenvalues + k = self.values[i] + else: + k = i + table1.row['var1'] = str(k) + table2.row['var1'] = str(k) + table1.row['var2'] = k % 2 + table2.row['var2'] = k % 2 + table1.row['var3'] = k + table2.row['var3'] = k + table1.row['var4'] = float(self.nrows - k - 1) + table2.row['var4'] = float(self.nrows - k - 1) + table1.row.append() + table2.row.append() + count += 1 + table1.flush() + table2.flush() + + t1var1 = table1.cols.var1 + t1var2 = table1.cols.var2 + t1var3 = table1.cols.var3 + t1var4 = table1.cols.var4 + self.assertFalse(t1var1.index.dirty) + self.assertFalse(t1var2.index.dirty) + self.assertFalse(t1var3.index.dirty) + self.assertFalse(t1var4.index.dirty) + + # Do some selections and check the results + # First selection: string + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + results1 = [p["var1"] for p in + table1.where('(il<=t1var1)&(t1var1<=sl)')] + results2 = [p["var1"] for p in table2 + if il <= p["var1"] <= sl] + results1.sort() + results2.sort() + if common.verbose: + print("Should look like:", results2) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Second selection: bool + results1 = [p["var2"] for p in table1.where('t1var2 == True')] + results2 = [p["var2"] for p in table2 if p["var2"] is True] + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Third selection: int + # Convert the limits to the appropriate type + il = int(self.il) + sl = int(self.sl) + + t1var3 = table1.cols.var3 + results1 = [p["var3"] for p in table1.where( + '(il<=t1var3)&(t1var3<=sl)')] + results2 = [p["var3"] for p in table2 + if il <= p["var3"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + 
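Editorial note: for readers skimming these query tests, the pattern they exercise (an in-kernel where() first, then the same condition served by a column index via get_where_list()/read_coordinates()) can be summarised with a short, self-contained sketch. This is only an illustration written for this note, not code from the test suite: the file name, table layout, limits and variable names below are invented.

    import tables as tb

    class Particle(tb.IsDescription):
        name = tb.StringCol(16)
        value = tb.Int32Col()

    with tb.open_file("query_demo.h5", "w") as h5:
        table = h5.create_table("/", "particles", Particle)
        row = table.row
        for i in range(1000):
            row["name"] = ("p%d" % i).encode("ascii")
            row["value"] = i % 100
            row.append()
        table.flush()

        # In-kernel query: the condition is compiled by numexpr and evaluated
        # over the table buffers; no index is involved yet.
        inkernel = [r["value"]
                    for r in table.where("(10 <= value) & (value <= 20)")]

        # Index the column and re-run the query; get_where_list() returns the
        # matching row coordinates and read_coordinates() reads them back.
        table.cols.value.create_index()
        coords = table.get_where_list("(10 <= value) & (value <= 20)")
        indexed = table.read_coordinates(coords, field="value")

        # Indexed queries do not guarantee row order, so compare sorted lists,
        # just as the tests above do.
        assert sorted(inkernel) == sorted(indexed.tolist())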
self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Fourth selection: float + # Convert the limits to the appropriate type + il = float(self.il) + sl = float(self.sl) + + # Do some selections and check the results + results1 = [p["var4"] for p in table1.where( + '(il<=t1var4)&(t1var4<=sl)')] + results2 = [p["var4"] for p in table2 + if il <= p["var4"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1.sort(), results2.sort()) + + def test13a(self): + """Checking repeated queries (checking caches)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13a..." % self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection (testing caches) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=2) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 2) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13b(self): + """Checking repeated queries, varying step (checking caches)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13b..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection (testing caches) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=2, stop=30, + step=2) + ] + results2 = [ + p["var1"] for p in table2.iterrows(2, 30, 2) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13c(self): + """Checking repeated queries, varying start, stop, step.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13c..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=0, stop=1, step=2) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 1, 2) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection (testing caches) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=0, stop=5, step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 5, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13d(self): + """Checking repeated queries, varying start, stop, step (another + twist)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13d..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname]) + ) + results1 = [ + p['var1'] for p in table1.where(condition, start=0, stop=1, step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 1, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection (testing caches) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=0, stop=1, step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 1, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13e(self): + """Checking repeated queries, with varying condition.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13e..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [ + p['var1'] for p in table1.where(condition, start=0, stop=10, + step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection with a more complex condition + t2col = table1.cols.var2 + condition = '(il<=t1col)&(t1col<=sl)&(t2col==True)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname, t2col.pathname])) + results1 = [ + p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1) + ] + results2 = [ + p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl and p["var2"] is True + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13f(self): + """Checking repeated queries, with varying condition.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13f..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Remove indexes in var2 column + table1.cols.var2.remove_index() + table2.cols.var2.remove_index() + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + t2col = table1.cols.var2 + self.assertIsNotNone(t2col) + condition = '(il<=t1col)&(t1col<=sl)&(t2col==True)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1)] + results2 = [ + p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl and p["var2"] is True + ] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection with a simpler condition + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1)] + results2 = [p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat again with the original condition, but with a constant + constant = True + condition = '(il<=t1col)&(t1col<=sl)&(t2col==constant)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1)] + results2 = [p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl and p["var2"] == constant] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + def test13g(self): + """Checking repeated queries, with different limits.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13g..." 
% self.__class__.__name__) + + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + # Convert the limits to the appropriate type + il = str(self.il).encode('ascii') + sl = str(self.sl).encode('ascii') + + # Do some selections and check the results + t1col = table1.cols.var1 + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1)] + results2 = [p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + # Repeat the selection with different limits + il, sl = (str(self.il + 1).encode( + 'ascii'), str(self.sl-2).encode('ascii')) + t2col = table1.cols.var2 + self.assertIsNotNone(t2col) + condition = '(il<=t1col)&(t1col<=sl)' + self.assertTrue( + table1.will_query_use_indexing(condition) == + fzset([t1col.pathname])) + results1 = [p['var1'] for p in + table1.where(condition, start=0, stop=10, step=1)] + results2 = [p["var1"] for p in table2.iterrows(0, 10, 1) + if il <= p["var1"] <= sl] + # sort lists (indexing does not guarantee that rows are returned in + # order) + results1.sort() + results2.sort() + if common.verbose: + print("Limits:", il, sl) + print("Length results:", len(results1)) + print("Should be:", len(results2)) + self.assertEqual(len(results1), len(results2)) + self.assertEqual(results1, results2) + + +class SV1aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + chunkshape = 1 + buffersize = 2 + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV1bTestCase(SV1aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = blocksizes[2]//2**9 + buffersize = chunkshape * 5 + + +class SV2aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + chunkshape = 2 + buffersize = 2 + ss = blocksizes[2] + nrows = ss * 2-1 + reopen = 1 + nrep = 1 + il = 0 + sl = 2 + + +class SV2bTestCase(SV2aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = blocksizes[2]//2**7 + buffersize = chunkshape * 20 + + +class SV3aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + chunkshape = 2 + buffersize = 3 + ss = blocksizes[2] + nrows = ss * 5-1 + reopen = 1 + nrep = 3 + il = 0 + sl = 3 + + +class SV3bTestCase(SV3aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) +# chunkshape = 4 +# buffersize = 16 + chunkshape = 3 + buffersize = 9 + + +class SV4aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + buffersize = 10 + ss = blocksizes[2] + nrows = ss * 3 + reopen = 0 + nrep = 1 + # il = nrows-cs + il = 0 + sl = nrows + + +class SV4bTestCase(SV4aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = 500 + buffersize = 1000 + + +class SV5aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + ss = blocksizes[2] + nrows = ss * 5 + reopen = 0 + nrep = 1 + il = 0 + sl = nrows + + +class SV5bTestCase(SV5aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + + +class SV6aTestCase(SelectValuesTestCase): + blocksizes = small_blocksizes + ss 
= blocksizes[2] + nrows = ss * 5 + 1 + reopen = 0 + cs = blocksizes[3] + nrep = cs + 1 + il = -1 + sl = nrows + + +class SV6bTestCase(SV6aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + + +class SV7aTestCase(SelectValuesTestCase): + random = 1 + blocksizes = small_blocksizes + ss = blocksizes[2] + nrows = ss * 5 + 3 + reopen = 0 + cs = blocksizes[3] + nrep = cs-1 + il = -10 + sl = nrows + + +class SV7bTestCase(SV7aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + + +class SV8aTestCase(SelectValuesTestCase): + random = 0 + chunkshape = 1 + blocksizes = small_blocksizes + ss = blocksizes[2] + nrows = ss * 5-3 + reopen = 0 + cs = blocksizes[3] + nrep = cs-1 + il = 10 + sl = nrows-10 + + +class SV8bTestCase(SV8aTestCase): + random = 0 + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + + +class SV9aTestCase(SelectValuesTestCase): + random = 1 + blocksizes = small_blocksizes + ss = blocksizes[2] + nrows = ss * 5 + 11 + reopen = 0 + cs = blocksizes[3] + nrep = cs-1 + il = 10 + sl = nrows-10 + + +class SV9bTestCase(SV9aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + + +class SV10aTestCase(SelectValuesTestCase): + random = 1 + blocksizes = small_blocksizes + chunkshape = 1 + buffersize = 1 + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV10bTestCase(SV10aTestCase): + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = 5 + buffersize = 6 + + +class SV11aTestCase(SelectValuesTestCase): + # This checks a special case that failed. It was discovered in a + # random test above (SV10a). It is explicitely put here as a way + # to always check that specific case. + values = [1, 7, 6, 7, 0, 7, 4, 4, 9, 5] + blocksizes = small_blocksizes + chunkshape = 1 + buffersize = 1 + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV11bTestCase(SelectValuesTestCase): + # This checks a special case that failed. It was discovered in a + # random test above (SV10a). It is explicitely put here as a way + # to always check that specific case. + values = [1, 7, 6, 7, 0, 7, 4, 4, 9, 5] + chunkshape = 2 + buffersize = 2 + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV12aTestCase(SelectValuesTestCase): + # This checks a special case that failed. It was discovered in a + # random test above (SV10b). It is explicitely put here as a way + # to always check that specific case. + # values = [0, 7, 0, 6, 5, 1, 6, 7, 0, 0] + values = [4, 4, 1, 5, 2, 0, 1, 4, 3, 9] + blocksizes = small_blocksizes + chunkshape = 1 + buffersize = 1 + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV12bTestCase(SelectValuesTestCase): + # This checks a special case that failed. It was discovered in a + # random test above (SV10b). It is explicitely put here as a way + # to always check that specific case. 
+ # values = [0, 7, 0, 6, 5, 1, 6, 7, 0, 0] + values = [4, 4, 1, 5, 2, 0, 1, 4, 3, 9] + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = 2 + buffersize = 2 + ss = blocksizes[2] + nrows = ss + reopen = 1 + nrep = ss + il = 0 + sl = ss + + +class SV13aTestCase(SelectValuesTestCase): + values = [0, 7, 0, 6, 5, 1, 6, 7, 0, 0] + blocksizes = small_blocksizes + chunkshape = 3 + buffersize = 5 + ss = blocksizes[2] + nrows = ss + reopen = 0 + nrep = ss + il = 0 + sl = ss + + +class SV13bTestCase(SelectValuesTestCase): + values = [0, 7, 0, 6, 5, 1, 6, 7, 0, 0] + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = 5 + buffersize = 10 + ss = blocksizes[2] + nrows = ss + reopen = 1 + nrep = ss + il = 0 + sl = ss + + +class SV14aTestCase(SelectValuesTestCase): + values = [1, 7, 6, 7, 0, 7, 4, 4, 9, 5] + blocksizes = small_blocksizes + chunkshape = 2 + buffersize = 5 + ss = blocksizes[2] + nrows = ss + reopen = 0 + cs = blocksizes[3] + nrep = cs + il = -5 + sl = 500 + + +class SV14bTestCase(SelectValuesTestCase): + values = [1, 7, 6, 7, 0, 7, 4, 4, 9, 5] + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + chunkshape = 9 + buffersize = 10 + ss = blocksizes[2] + nrows = ss + reopen = 1 + nrep = 9 + il = 0 + cs = blocksizes[3] + sl = ss-cs + 1 + + +class SV15aTestCase(SelectValuesTestCase): + # Test that checks for case where there are not valid values in + # the indexed part, but they exist in the non-indexed region. + # At least, test01b takes account of that + random = 1 + # Both values of seed below triggers a fail in indexing code + # seed = 1885 + seed = 183 + blocksizes = small_blocksizes + ss = blocksizes[2] + nrows = ss * 5 + 1 + reopen = 0 + cs = blocksizes[3] + nrep = cs-1 + il = -10 + sl = nrows + + +class SV15bTestCase(SelectValuesTestCase): + # Test that checks for case where there are not valid values in + # the indexed part, but they exist in the non-indexed region. 
+ # At least, test01b takes account of that + random = 1 + # Both values of seed below triggers a fail in indexing code + seed = 1885 + # seed = 183 + blocksizes = tb.idxutils.calc_chunksize(minRowIndex, memlevel=1) + ss = blocksizes[2] + nrows = ss * 5 + 1 + reopen = 1 + cs = blocksizes[3] + nrep = cs-1 + il = -10 + sl = nrows + + +class LastRowReuseBuffers(common.PyTablesTestCase): + # Test that checks for possible reuse of buffers coming + # from last row in the sorted part of indexes + nelem = 1221 + np.random.seed(1) + random.seed(1) + + class Record(tb.IsDescription): + id1 = tb.Int16Col() + + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(".h5") + self.h5file = None + + def tearDown(self): + if self.h5file is not None: + self.h5file.close() + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def test00_lrucache(self): + self.h5file = tb.open_file(self.h5fname, 'w', node_cache_slots=64) + ta = self.h5file.create_table('/', 'table', self.Record, + filters=tb.Filters(1)) + id1 = np.random.randint(0, 2**15, self.nelem) + ta.append([id1]) + + ta.cols.id1.create_index() + + for i in range(self.nelem): + nrow = random.randrange(self.nelem) + value = id1[nrow] + idx = ta.get_where_list('id1 == %s' % value) + self.assertGreater(len(idx), 0, f"idx--> {idx} {i} {nrow} {value}") + self.assertTrue( + nrow in idx, + f"nrow not found: {idx} != {nrow}, {value}") + + def test01_nocache(self): + self.h5file = tb.open_file(self.h5fname, 'w', node_cache_slots=0) + ta = self.h5file.create_table('/', 'table', self.Record, + filters=tb.Filters(1)) + id1 = np.random.randint(0, 2**15, self.nelem) + ta.append([id1]) + + ta.cols.id1.create_index() + + for i in range(self.nelem): + nrow = random.randrange(self.nelem) + value = id1[nrow] + idx = ta.get_where_list('id1 == %s' % value) + self.assertGreater(len(idx), 0, f"idx--> {idx} {i} {nrow} {value}") + self.assertTrue( + nrow in idx, + f"nrow not found: {idx} != {nrow}, {value}") + + def test02_dictcache(self): + self.h5file = tb.open_file(self.h5fname, 'w', node_cache_slots=-64) + ta = self.h5file.create_table('/', 'table', self.Record, + filters=tb.Filters(1)) + id1 = np.random.randint(0, 2**15, self.nelem) + ta.append([id1]) + + ta.cols.id1.create_index() + + for i in range(self.nelem): + nrow = random.randrange(self.nelem) + value = id1[nrow] + idx = ta.get_where_list('id1 == %s' % value) + self.assertGreater(len(idx), 0, f"idx--> {idx} {i} {nrow} {value}") + self.assertTrue( + nrow in idx, + f"nrow not found: {idx} != {nrow}, {value}") + + +normal_tests = ( + "SV1aTestCase", "SV2aTestCase", "SV3aTestCase", +) + +heavy_tests = ( + # The next are too hard to be in the 'normal' suite + "SV1bTestCase", "SV2bTestCase", "SV3bTestCase", + "SV4aTestCase", "SV5aTestCase", "SV6aTestCase", + "SV7aTestCase", "SV8aTestCase", "SV9aTestCase", + "SV10aTestCase", "SV11aTestCase", "SV12aTestCase", + "SV13aTestCase", "SV14aTestCase", "SV15aTestCase", + # This are properly heavy + "SV4bTestCase", "SV5bTestCase", "SV6bTestCase", + "SV7bTestCase", "SV8bTestCase", "SV9bTestCase", + "SV10bTestCase", "SV11bTestCase", "SV12bTestCase", + "SV13bTestCase", "SV14bTestCase", "SV15bTestCase", + ) + + +# Base classes for the different type indexes. +class UltraLightITableMixin: + kind = "ultralight" + + +class LightITableMixin: + kind = "light" + + +class MediumITableMixin: + kind = "medium" + + +class FullITableMixin: + kind = "full" + + +# Parameters for indexed queries. 
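Editorial note: the UltraLight/Light/Medium/Full mixins defined above correspond to the kind argument accepted by Column.create_index(). A rough sketch of how those kinds are selected in user code follows; the file, table and column names are made up for illustration and are not part of the test module.

    import tables as tb

    with tb.open_file("kinds_demo.h5", "w") as h5:
        table = h5.create_table("/", "t", {"var1": tb.IntCol()})
        table.append([(i,) for i in range(10_000)])
        # kind may be 'ultralight', 'light', 'medium' or 'full'; heavier kinds
        # (and higher optlevels) generally answer queries faster at the cost
        # of longer index build times and more disk space.
        table.cols.var1.create_index(kind="medium", optlevel=6)
        # table.cols.var1.create_csindex() would instead build a completely
        # sorted ('full', optlevel=9) index.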
+ckinds = ['UltraLight', 'Light', 'Medium', 'Full'] +testlevels = ['Normal', 'Heavy'] + +# Indexed queries: ``[ULMF]I[NH]SVXYTestCase``, where: +# +# 1. U is for 'UltraLight', L for 'Light', M for 'Medium', F for 'Full' indexes +# 2. N is for 'Normal', H for 'Heavy' tests + + +def iclassdata(): + for ckind in ckinds: + for ctest in normal_tests + heavy_tests: + classname = f'{ckind[0]}I{testlevels[common.heavy][0]}{ctest}' + # Uncomment the next one and comment the past one if one + # don't want to include the methods (testing purposes only) + # cbasenames = ( '%sITableMixin' % ckind, "object") + cbasenames = ('%sITableMixin' % ckind, ctest) + classdict = dict(heavy=bool(ctest in heavy_tests)) + yield (classname, cbasenames, classdict) + + +# Create test classes. +for (cname, cbasenames, cdict) in iclassdata(): + cbases = tuple(eval(cbase) for cbase in cbasenames) + class_ = type(cname, cbases, cdict) + exec('%s = class_' % cname) + + +# Test case for issue #319 +class BuffersizeMultipleChunksize(common.TempFileMixin, + common.PyTablesTestCase): + open_mode = 'w' + + def test01(self): + np.random.seed(2) + n = 700_000 + cs = 50_000 + nchunks = n // cs + + arr = np.zeros( + (n,), dtype=[('index', 'i8'), ('o', 'i8'), ('value', 'f8')]) + arr['index'] = np.arange(n) + arr['o'] = np.random.randint(-20_000, -15_000, size=n) + arr['value'] = np.random.randn(n) + + node = self.h5file.create_group('/', 'foo') + table = self.h5file.create_table(node, 'table', dict( + index=tb.Int64Col(), + o=tb.Int64Col(), + value=tb.FloatCol(shape=())), expectedrows=10_000_000) + + table.append(arr) + + self._reopen('a') + + v1 = np.unique(arr['o'])[0] + v2 = np.unique(arr['o'])[1] + res = np.array([v1, v2]) + selector = f'((o == {v1}) | (o == {v2}))' + if common.verbose: + print("selecting values: %s" % selector) + + table = self.h5file.root.foo.table + + result = np.unique(table.read_where(selector)['o']) + np.testing.assert_almost_equal(result, res) + if common.verbose: + print("select entire table:") + print(f"result: {result}\texpected: {res}") + + if common.verbose: + print("index the column o") + table.cols.o.create_index() # this was triggering the issue + + if common.verbose: + print("select via chunks") + for i in range(nchunks): + result = table.read_where(selector, start=i*cs, stop=(i+1)*cs) + result = np.unique(result['o']) + np.testing.assert_almost_equal(np.unique(result), res) + if common.verbose: + print(f"result: {result}\texpected: {res}") + + +# Test case for issue #441 +class SideEffectNumPyQuicksort(common.PyTablesTestCase): + + def test01(self): + bug_file = common.test_filename("bug-idx.h5") + tmp_file = tempfile.mktemp(".h5") + tb.copy_file(bug_file, tmp_file) + h5 = tb.open_file(tmp_file, "a") + o = h5.root.table + vals = o.cols.path[:] + npvals = set(np.where(vals == 6)[0]) + + # Setting the chunkshape is critical for reproducing the bug + t = o.copy(newname="table2", chunkshape=2730) + t.cols.path.create_index() + indexed = {r.nrow for r in t.where('path == 6')} + + if common.verbose: + diffs = sorted(npvals - indexed) + print("ndiff:", len(diffs), diffs) + self.assertEqual(len(npvals), len(indexed)) + + h5.close() + if Path(tmp_file).is_file(): + Path(tmp_file).unlink() + + +# ----------------------------- + + +def suite(): + theSuite = common.unittest.TestSuite() + + niter = 1 + + for n in range(niter): + for cdata in iclassdata(): + class_ = eval(cdata[0]) + if not class_.heavy: + suite_ = common.unittest.makeSuite(class_) + theSuite.addTest(suite_) + elif common.heavy: + suite_ = 
common.unittest.makeSuite(class_) + theSuite.addTest(suite_) + theSuite.addTest(common.unittest.makeSuite(LastRowReuseBuffers)) + theSuite.addTest( + common.unittest.makeSuite(BuffersizeMultipleChunksize)) + theSuite.addTest(common.unittest.makeSuite(SideEffectNumPyQuicksort)) + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_links.py b/tables/tests/test_links.py new file mode 100644 index 0000000..b8eb122 --- /dev/null +++ b/tables/tests/test_links.py @@ -0,0 +1,597 @@ +"""Test module for diferent kind of links under PyTables.""" + +import re +import tempfile +from pathlib import Path + +import tables as tb +from tables.tests import common + + +# Test for hard links +class HardLinkTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self._createFile() + + def _createFile(self): + self.h5file.create_array('/', 'arr1', [1, 2]) + group1 = self.h5file.create_group('/', 'group1') + arr2 = self.h5file.create_array(group1, 'arr2', [1, 2, 3]) + lgroup1 = self.h5file.create_hard_link('/', 'lgroup1', '/group1') + self.assertIsNotNone(lgroup1) + larr1 = self.h5file.create_hard_link(group1, 'larr1', '/arr1') + self.assertIsNotNone(larr1) + larr2 = self.h5file.create_hard_link('/', 'larr2', arr2) + self.assertIsNotNone(larr2) + + def test00_create(self): + """Creating hard links.""" + + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup1, + hardlink=True) + self._checkEqualityLeaf(self.h5file.root.arr1, + self.h5file.root.group1.larr1, + hardlink=True) + self._checkEqualityLeaf(self.h5file.root.lgroup1.arr2, + self.h5file.root.larr2, + hardlink=True) + + def test01_open(self): + """Opening a file with hard links.""" + + self._reopen() + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup1, + hardlink=True) + self._checkEqualityLeaf(self.h5file.root.arr1, + self.h5file.root.group1.larr1, + hardlink=True) + self._checkEqualityLeaf(self.h5file.root.lgroup1.arr2, + self.h5file.root.larr2, + hardlink=True) + + def test02_removeLeaf(self): + """Removing a hard link to a Leaf.""" + + # First delete the initial link + self.h5file.root.arr1.remove() + self.assertNotIn('/arr1', self.h5file) + # The second link should still be there + if common.verbose: + print("Remaining link:", self.h5file.root.group1.larr1) + self.assertIn('/group1/larr1', self.h5file) + # Remove the second link + self.h5file.root.group1.larr1.remove() + self.assertNotIn('/group1/larr1', self.h5file) + + def test03_removeGroup(self): + """Removing a hard link to a Group.""" + + if common.verbose: + print("Original object tree:", self.h5file) + # First delete the initial link + self.h5file.root.group1._f_remove(force=True) + self.assertNotIn('/group1', self.h5file) + # The second link should still be there + if common.verbose: + print("Remaining link:", self.h5file.root.lgroup1) + print("Object tree:", self.h5file) + self.assertIn('/lgroup1', self.h5file) + # Remove the second link + self.h5file.root.lgroup1._g_remove(recursive=True) + self.assertNotIn('/lgroup1', self.h5file) + if common.verbose: + print("Final object tree:", self.h5file) + + +# Test for soft links +class SoftLinkTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self._createFile() + + def _createFile(self): + self.h5file.create_array('/', 'arr1', [1, 2]) + group1 = self.h5file.create_group('/', 
'group1') + arr2 = self.h5file.create_array(group1, 'arr2', [1, 2, 3]) + lgroup1 = self.h5file.create_soft_link('/', 'lgroup1', '/group1') + self.assertIsNotNone(lgroup1) + larr1 = self.h5file.create_soft_link(group1, 'larr1', '/arr1') + self.assertIsNotNone(larr1) + larr2 = self.h5file.create_soft_link('/', 'larr2', arr2) + self.assertIsNotNone(larr2) + + def test00_create(self): + """Creating soft links.""" + + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup1()) + self._checkEqualityLeaf(self.h5file.root.arr1, + self.h5file.root.group1.larr1()) + self._checkEqualityLeaf(self.h5file.root.lgroup1().arr2, + self.h5file.root.larr2()) + + def test01_open(self): + """Opening a file with soft links.""" + + self._reopen() + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup1()) + self._checkEqualityLeaf(self.h5file.root.arr1, + self.h5file.root.group1.larr1()) + self._checkEqualityLeaf(self.h5file.root.lgroup1().arr2, + self.h5file.root.larr2()) + + def test02_remove(self): + """Removing a soft link.""" + + # First delete the referred link + self.h5file.root.arr1.remove() + self.assertNotIn('/arr1', self.h5file) + # The soft link should still be there (but dangling) + if common.verbose: + print("Dangling link:", self.h5file.root.group1.larr1) + self.assertIn('/group1/larr1', self.h5file) + # Remove the soft link itself + self.h5file.root.group1.larr1.remove() + self.assertNotIn('/group1/larr1', self.h5file) + + def test03_copy(self): + """Copying a soft link.""" + + # Copy the link into another location + root = self.h5file.root + lgroup1 = root.lgroup1 + lgroup2 = lgroup1.copy('/', 'lgroup2') + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self.assertIn('lgroup2', root._v_children) + self.assertIn('lgroup2', root._v_links) + if common.verbose: + print("Copied link:", lgroup2) + # Remove the first link + lgroup1.remove() + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup2()) + + def test03_overwrite(self): + """Overwrite a soft link.""" + + # Copy the link into another location + root = self.h5file.root + lgroup1 = root.lgroup1 + lgroup2 = lgroup1.copy('/', 'lgroup2') + lgroup2 = lgroup1.copy('/', 'lgroup2', overwrite=True) + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self.assertIn('lgroup2', root._v_children) + self.assertIn('lgroup2', root._v_links) + if common.verbose: + print("Copied link:", lgroup2) + # Remove the first link + lgroup1.remove() + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup2()) + + def test04_move(self): + """Moving a soft link.""" + + # Move the link into another location + lgroup1 = self.h5file.root.lgroup1 + group2 = self.h5file.create_group('/', 'group2') + lgroup1.move(group2, 'lgroup2') + lgroup2 = self.h5file.root.group2.lgroup2 + if common.verbose: + print("Moved link:", lgroup2) + self.assertNotIn('/lgroup1', self.h5file) + self.assertIn('/group2/lgroup2', self.h5file) + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.group2.lgroup2()) + + def test05_rename(self): + """Renaming a soft link.""" + + # Rename the link + lgroup1 = self.h5file.root.lgroup1 + lgroup1.rename('lgroup2') + lgroup2 = self.h5file.root.lgroup2 + if common.verbose: + print("Moved link:", lgroup2) + self.assertNotIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self._checkEqualityGroup(self.h5file.root.group1, + self.h5file.root.lgroup2()) + + def 
test06a_relative_path(self): + """Using soft links with relative paths.""" + + # Create new group + self.h5file.create_group('/group1', 'group3') + # ... and relative link + lgroup3 = self.h5file.create_soft_link( + '/group1', 'lgroup3', 'group3') + if common.verbose: + print("Relative path link:", lgroup3) + self.assertIn('/group1/lgroup3', self.h5file) + self._checkEqualityGroup(self.h5file.root.group1.group3, + self.h5file.root.group1.lgroup3()) + + def test06b_relative_path(self): + """Using soft links with relative paths (./ version)""" + + # Create new group + self.h5file.create_group('/group1', 'group3') + # ... and relative link + lgroup3 = self.h5file.create_soft_link( + '/group1', 'lgroup3', './group3') + if common.verbose: + print("Relative path link:", lgroup3) + self.assertIn('/group1/lgroup3', self.h5file) + self._checkEqualityGroup(self.h5file.root.group1.group3, + self.h5file.root.group1.lgroup3()) + + def test07_walkNodes(self): + """Checking `walk_nodes` with `classname` option.""" + + links = [node._v_pathname for node in + self.h5file.walk_nodes('/', classname="Link")] + if common.verbose: + print("detected links (classname='Link'):", links) + self.assertEqual(links, ['/larr2', '/lgroup1', '/group1/larr1']) + links = [node._v_pathname for node in + self.h5file.walk_nodes('/', classname="SoftLink")] + if common.verbose: + print("detected links (classname='SoftLink'):", links) + self.assertEqual(links, ['/larr2', '/lgroup1', '/group1/larr1']) + + def test08__v_links(self): + """Checking `Group._v_links`.""" + + links = [node for node in self.h5file.root._v_links] + if common.verbose: + print("detected links (under root):", links) + self.assertEqual(len(links), 2) + links = [node for node in self.h5file.root.group1._v_links] + if common.verbose: + print("detected links (under /group1):", links) + self.assertEqual(links, ['larr1']) + + def test09_link_to_link(self): + """Checking linked links.""" + + # Create a link to another existing link + lgroup2 = self.h5file.create_soft_link( + '/', 'lgroup2', '/lgroup1') + # Dereference it once: + self.assertIs(lgroup2(), self.h5file.get_node('/lgroup1')) + if common.verbose: + print("First dereference is correct:", lgroup2()) + # Dereference it twice: + self.assertIs(lgroup2()(), self.h5file.get_node('/group1')) + if common.verbose: + print("Second dereference is correct:", lgroup2()()) + + def test10_copy_link_to_file(self): + """Checking copying a link to another file.""" + + fname = tempfile.mktemp(".h5") + h5f = tb.open_file(fname, "a") + h5f.create_array('/', 'arr1', [1, 2]) + h5f.create_group('/', 'group1') + lgroup1 = self.h5file.root.lgroup1 + lgroup1_ = lgroup1.copy(h5f.root, 'lgroup1') + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup1', h5f) + self.assertIn(lgroup1_, h5f) + if common.verbose: + print("Copied link:", lgroup1_, 'in:', lgroup1_._v_file.filename) + h5f.close() + Path(fname).unlink() + + def test11_direct_attribute_access(self): + """Check direct get/set attributes via link-->target.attribute""" + + larr1 = self.h5file.get_node('/lgroup1/larr1') + arr1 = self.h5file.get_node('/arr1') + # get + self.assertEqual(larr1.shape, (2,)) + self.assertEqual(larr1[:], [1, 2]) + # set + larr1[0] = -1 + self.assertEqual(arr1[:], [-1, 2]) + + def test12_access_child_node_attributes(self): + """Check get/set attributes via link-->target.child.attribute""" + + lgroup1 = self.h5file.get_node('/lgroup1') + arr2 = self.h5file.get_node('/group1/arr2') + # get child attribute + self.assertEqual(lgroup1.arr2[:], [1, 
2, 3]) + # set child attribute + lgroup1.arr2[0] = -1 + self.assertEqual(arr2[:], [-1, 2, 3]) + + def test13_direct_attribute_access_via_chained_softlinks(self): + """Check get/set access via link2-->link1-->target.child.attribute""" + + self.h5file.get_node('/lgroup1') + arr2 = self.h5file.get_node('/group1/arr2') + # multiple chained links + l_lgroup1 = self.h5file.create_soft_link('/', 'l_lgroup1', '/lgroup1') + # get child attribute + self.assertEqual(l_lgroup1.arr2[:], [1, 2, 3]) + # set child attribute + l_lgroup1.arr2[0] = -1 + self.assertEqual(arr2[:], [-1, 2, 3]) + + def test14_child_of_softlink_to_group(self): + """Create an array whose parent is a softlink to another group""" + + self.h5file.get_node('/group1') + lgroup1 = self.h5file.get_node('/lgroup1') + self.h5file.create_array(lgroup1, 'new_arr', obj=[1, 2, 3]) + new_arr2 = self.h5file.get_node('/group1/new_arr') + self.assertEqual(new_arr2[:], [1, 2, 3]) + + def test_str(self): + s = str(self.h5file) + self.assertEqual(len(re.findall(r'\(SoftLink\)', s)), 3) + self.assertEqual(len(re.findall(r'\(dangling\)', s)), 0) + + def test_str_with_dangling_link(self): + self.h5file.root.group1.arr2.remove() + s = str(self.h5file) + self.assertEqual(len(re.findall(r'\(SoftLink\)', s)), 3) + self.assertEqual(len(re.findall(r'\(dangling\)', s)), 1) + + +# Test for external links +@common.unittest.skipIf(tb.file._FILE_OPEN_POLICY == 'strict', + 'FILE_OPEN_POLICY = "strict"') +class ExternalLinkTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + self.extfname = tempfile.mktemp(".h5") + self.exth5file = tb.open_file(self.extfname, "w") + self._createFile() + + def tearDown(self): + """Remove ``extfname``.""" + + extfname = self.extfname + self.exth5file.close() + super().tearDown() + + # open_files = tables.file._open_files + # if self.extfname in open_files: + # #assert False + # for handler in open_files.get_handlers_by_name(self.extfname): + # handler.close() + + Path(extfname).unlink() # comment this for debugging purposes only + + def _createFile(self): + self.h5file.create_array('/', 'arr1', [1, 2]) + group1 = self.h5file.create_group('/', 'group1') + self.h5file.create_array(group1, 'arr2', [1, 2, 3]) + + # The external file + extarr1 = self.exth5file.create_array('/', 'arr1', [1, 2]) + self.assertIsNotNone(extarr1) + extgroup1 = self.exth5file.create_group('/', 'group1') + extarr2 = self.exth5file.create_array(extgroup1, 'arr2', [1, 2, 3]) + + # Create external links + lgroup1 = self.h5file.create_external_link( + '/', 'lgroup1', '%s:/group1' % self.extfname) + self.assertIsNotNone(lgroup1) + larr1 = self.h5file.create_external_link( + group1, 'larr1', '%s:/arr1' % self.extfname) + self.assertIsNotNone(larr1) + larr2 = self.h5file.create_external_link('/', 'larr2', extarr2) + self.assertIsNotNone(larr2) + + # Re-open the external file in 'r'ead-only mode + self.exth5file.close() + self.exth5file = tb.open_file(self.extfname, "r") + + def test00_create(self): + """Creating soft links.""" + + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.lgroup1()) + self._checkEqualityLeaf(self.exth5file.root.arr1, + self.h5file.root.group1.larr1()) + self._checkEqualityLeaf(self.h5file.root.lgroup1().arr2, + self.h5file.root.larr2()) + + def test01_open(self): + """Opening a file with soft links.""" + + self._reopen() + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.lgroup1()) + self._checkEqualityLeaf(self.exth5file.root.arr1, + 
self.h5file.root.group1.larr1()) + self._checkEqualityLeaf(self.h5file.root.lgroup1().arr2, + self.h5file.root.larr2()) + + def test02_remove(self): + """Removing an external link.""" + + # Re-open the external file in 'a'ppend mode + self.exth5file.close() + self.exth5file = tb.open_file(self.extfname, "a") + + # First delete the referred link + self.exth5file.root.arr1.remove() + self.assertNotIn('/arr1', self.exth5file) + + # The external link should still be there (but dangling) + if common.verbose: + print("Dangling link:", self.h5file.root.group1.larr1) + self.assertIn('/group1/larr1', self.h5file) + + # Remove the external link itself + self.h5file.root.group1.larr1.remove() + self.assertNotIn('/group1/larr1', self.h5file) + + def test03_copy(self): + """Copying an external link.""" + + # Copy the link into another location + root = self.h5file.root + lgroup1 = root.lgroup1 + lgroup2 = lgroup1.copy('/', 'lgroup2') + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self.assertIn('lgroup2', root._v_children) + self.assertIn('lgroup2', root._v_links) + if common.verbose: + print("Copied link:", lgroup2) + + # Remove the first link + lgroup1.remove() + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.lgroup2()) + + def test03_overwrite(self): + """Overwrite an external link.""" + + # Copy the link into another location + root = self.h5file.root + lgroup1 = root.lgroup1 + lgroup2 = lgroup1.copy('/', 'lgroup2') + lgroup2 = lgroup1.copy('/', 'lgroup2', overwrite=True) + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self.assertIn('lgroup2', root._v_children) + self.assertIn('lgroup2', root._v_links) + if common.verbose: + print("Copied link:", lgroup2) + + # Remove the first link + lgroup1.remove() + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.lgroup2()) + + def test04_move(self): + """Moving an external link.""" + + # Move the link into another location + lgroup1 = self.h5file.root.lgroup1 + group2 = self.h5file.create_group('/', 'group2') + lgroup1.move(group2, 'lgroup2') + lgroup2 = self.h5file.root.group2.lgroup2 + if common.verbose: + print("Moved link:", lgroup2) + self.assertNotIn('/lgroup1', self.h5file) + self.assertIn('/group2/lgroup2', self.h5file) + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.group2.lgroup2()) + + def test05_rename(self): + """Renaming an external link.""" + + # Rename the link + lgroup1 = self.h5file.root.lgroup1 + lgroup1.rename('lgroup2') + lgroup2 = self.h5file.root.lgroup2 + if common.verbose: + print("Moved link:", lgroup2) + self.assertNotIn('/lgroup1', self.h5file) + self.assertIn('/lgroup2', self.h5file) + self._checkEqualityGroup(self.exth5file.root.group1, + self.h5file.root.lgroup2()) + + def test07_walkNodes(self): + """Checking `walk_nodes` with `classname` option.""" + + # Create a new soft link + self.h5file.create_soft_link('/group1', 'lgroup3', './group3') + links = [node._v_pathname for node in + self.h5file.walk_nodes('/', classname="Link")] + if common.verbose: + print("detected links (classname='Link'):", links) + self.assertEqual(links, ['/larr2', '/lgroup1', + '/group1/larr1', '/group1/lgroup3']) + links = [node._v_pathname for node in + self.h5file.walk_nodes('/', classname="ExternalLink")] + if common.verbose: + print("detected links (classname='ExternalLink'):", links) + self.assertEqual(links, ['/larr2', '/lgroup1', '/group1/larr1']) + + def test08__v_links(self): + """Checking 
`Group._v_links`.""" + + links = [node for node in self.h5file.root._v_links] + if common.verbose: + print("detected links (under root):", links) + self.assertEqual(len(links), 2) + links = [node for node in self.h5file.root.group1._v_links] + if common.verbose: + print("detected links (under /group1):", links) + self.assertEqual(links, ['larr1']) + + def test09_umount(self): + """Checking `umount()` method.""" + + link = self.h5file.root.lgroup1 + self.assertIsNone(link.extfile) + + # Dereference a external node (and hence, 'mount' a file) + enode = link() + self.assertIsNotNone(enode) + self.assertIsNotNone(link.extfile) + + # Umount the link + link.umount() + self.assertIsNone(link.extfile) + + def test10_copy_link_to_file(self): + """Checking copying a link to another file.""" + + h5fname2 = tempfile.mktemp(".h5") + try: + with tb.open_file(h5fname2, "a") as h5file2: + h5file2.create_array('/', 'arr1', [1, 2]) + h5file2.create_group('/', 'group1') + lgroup1 = self.h5file.root.lgroup1 + lgroup1_ = lgroup1.copy(h5file2.root, 'lgroup1') + self.assertIn('/lgroup1', self.h5file) + self.assertIn('/lgroup1', h5file2) + self.assertIn(lgroup1_, h5file2) + if common.verbose: + print("Copied link:", lgroup1_, 'in:', + lgroup1_._v_file.filename) + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(HardLinkTestCase)) + theSuite.addTest(common.unittest.makeSuite(SoftLinkTestCase)) + theSuite.addTest(common.unittest.makeSuite(ExternalLinkTestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_lists.py b/tables/tests/test_lists.py new file mode 100644 index 0000000..15aac81 --- /dev/null +++ b/tables/tests/test_lists.py @@ -0,0 +1,455 @@ +import tempfile +from pathlib import Path + +import tables as tb +from tables.tests import common + + +def WriteRead(filename, testTuple): + if common.verbose: + print('\n', '-=' * 30) + print("Running test for object %s" % type(testTuple)) + + # Create an instance of HDF5 Table + fileh = tb.open_file(filename, mode="w") + root = fileh.root + try: + # Create the array under root and name 'somearray' + a = testTuple + fileh.create_array(root, 'somearray', a, "Some array") + finally: + # Close the file + fileh.close() + + # Re-open the file in read-only mode + fileh = tb.open_file(filename, mode="r") + root = fileh.root + + # Read the saved array + try: + b = root.somearray.read() + # Compare them. They should be equal. 
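+ # Note: a plain ``==`` on lists/tuples compares them element by element, so
+ # any shape or value mismatch is reported verbosely here before the strict
+ # assertion below.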
+ if not a == b and common.verbose: + print("Write and read lists/tuples differ!") + print("Object written:", a) + print("Object read:", b) + + # Check strictly the array equality + assert a == b + finally: + # Close the file + fileh.close() + + +class BasicTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(".h5") + self.h5file = None + + def tearDown(self): + if self.h5file is not None: + self.h5file.close() + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def test00_char(self): + """Data integrity during recovery (character types)""" + + a = self.charList + WriteRead(self.h5fname, a) + + def test01_types(self): + """Data integrity during recovery (numerical types)""" + + a = self.numericalList + WriteRead(self.h5fname, a) + + +class Basic0DOneTestCase(BasicTestCase): + # Scalar case + title = "Rank-0 case 1" + numericalList = 3 + charList = b"3" + + +class Basic0DTwoTestCase(BasicTestCase): + # Scalar case + title = "Rank-0 case 2" + numericalList = 33.34 + charList = b"33"*500 + +# This does not work anymore because I've splitted the chunked arrays to happen +# mainly in EArray objects +# class Basic1DZeroTestCase(BasicTestCase): +# title = "Rank-1 case 0" +# numericalList = [] +# charList = [] + + +class Basic1DOneTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 1" + numericalList = [3] + charList = [b"a"] + + +class Basic1DTwoTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 2" + numericalList = [3.2, 4.2] + charList = [b"aaa"] + + +class Basic2DTestCase(BasicTestCase): + # 2D case + title = "Rank-2 case 1" + numericalList = [[1, 2]]*5 + charList = [[b"qq", b"zz"]]*5 + + +class Basic10DTestCase(BasicTestCase): + # 10D case + title = "Rank-10 case 1" + numericalList = [[[[[[[[[[1, 2], [3, 4]]]]]]]]]]*5 + # Dimensions greather than 6 in strings gives some warnings + charList = [[[[[[[[[[b"a", b"b"], [b"qq", b"zz"]]]]]]]]]]*5 + + +class ExceptionTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5fname = tempfile.mktemp(".h5") + self.h5file = None + + def tearDown(self): + if self.h5file is not None: + self.h5file.close() + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + super().tearDown() + + def test00_char(self): + """Non suppported lists objects (character objects)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running test for %s" % (self.title)) + a = self.charList + with self.assertRaises((ValueError, TypeError)): + WriteRead(self.h5fname, a) + + def test01_types(self): + """Non supported lists object (numerical types)""" + + a = self.numericalList + with self.assertRaises((ValueError, TypeError)): + WriteRead(self.h5fname, a) + + +class Basic1DFourTestCase(ExceptionTestCase): + title = "Rank-1 case 4 (non-regular list)" + numericalList = [3, [4, 5.2]] + charList = [b"aaa", [b"bbb", b"ccc"]] + + +class GetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test00_single(self): + """Single element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charList + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0]) + print("Read first element:", arr[0]) + self.assertEqual(a[0], arr[0]) + + def test01_single(self): + """Single element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = 
self.numericalList + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original first element:", a[0]) + print("Read first element:", arr[0]) + self.assertEqual(a[0], arr[0]) + + def test02_range(self): + """Range element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertEqual(a[1:4], arr[1:4]) + + def test03_range(self): + """Range element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4]) + print("Read elements:", arr[1:4]) + self.assertEqual(a[1:4], arr[1:4]) + + def test04_range(self): + """Range element access, strided (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertEqual(a[1:4:2], arr[1:4:2]) + + def test05_range(self): + """Range element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original elements:", a[1:4:2]) + print("Read elements:", arr[1:4:2]) + self.assertEqual(a[1:4:2], arr[1:4:2]) + + def test06_negativeIndex(self): + """Negative Index element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original last element:", a[-1]) + print("Read last element:", arr[-1]) + self.assertEqual(a[-1], arr[-1]) + + def test07_negativeIndex(self): + """Negative Index element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original before last element:", a[-2]) + print("Read before last element:", arr[-2]) + self.assertEqual(a[-2], arr[-2]) + + def test08_negativeRange(self): + """Negative range element access (character types)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original last elements:", a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertEqual(a[-4:-1], arr[-4:-1]) + + def test09_negativeRange(self): + """Negative range element access (numerical types)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if common.verbose: + print("Original last elements:", 
a[-4:-1]) + print("Read last elements:", arr[-4:-1]) + self.assertEqual(a[-4:-1], arr[-4:-1]) + + +class GI1ListTestCase(GetItemTestCase): + title = "Rank-1 case 1 (lists)" + numericalList = [3] + numericalListME = [3, 2, 1, 0, 4, 5, 6] + charList = [b"3"] + charListME = [b"321", b"221", b"121", b"021", b"421", b"521", b"621"] + + +class GI2ListTestCase(GetItemTestCase): + # A more complex example + title = "Rank-1,2 case 2 (lists)" + numericalList = [3, 4] + numericalListME = [[3, 2, 1, 0, 4, 5, 6], + [2, 1, 0, 4, 5, 6, 7], + [4, 3, 2, 1, 0, 4, 5], + [3, 2, 1, 0, 4, 5, 6], + [3, 2, 1, 0, 4, 5, 6]] + + charList = [b"a", b"b"] + charListME = [ + [b"321", b"221", b"121", b"021", b"421", b"521", b"621"], + [b"21", b"21", b"11", b"02", b"42", b"21", b"61"], + [b"31", b"21", b"12", b"21", b"41", b"51", b"621"], + [b"321", b"221", b"121", b"021", b"421", b"521", b"621"], + [b"3241", b"2321", b"13216", b"0621", b"4421", b"5421", b"a621"], + [b"a321", b"s221", b"d121", b"g021", b"b421", b"5vvv21", b"6zxzxs21"], + ] + + +class GeneratorTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test00a_single(self): + """Testing generator access to Arrays, single elements (char)""" + + # Create the array under root and name 'somearray' + a = self.charList + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + def test00b_me(self): + """Testing generator access to Arrays, multiple elements (char)""" + + # Create the array under root and name 'somearray' + a = self.charListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if isinstance(a[0], tuple): + ga = [list(i) for i in a] + else: + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + def test01a_single(self): + """Testing generator access to Arrays, single elements (numeric)""" + + # Create the array under root and name 'somearray' + a = self.numericalList + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + def test01b_me(self): + """Testing generator access to Arrays, multiple elements (numeric)""" + + # Create the array under root and name 'somearray' + a = self.numericalListME + arr = self.h5file.create_array(self.h5file.root, 'somearray', a, + "Some array") + + # Get and compare an element + if isinstance(a[0], tuple): + ga = [list(i) for i in a] + else: + ga = [i for i in a] + garr = [i for i in arr] + if common.verbose: + print("Result of original iterator:", ga) + print("Result of read generator:", garr) + self.assertEqual(ga, garr) + + +class GE1ListTestCase(GeneratorTestCase): + # Scalar case + title = "Rank-1 case 1 (lists)" + numericalList = [3] + numericalListME = [3, 2, 1, 0, 4, 5, 6] + charList = [b"3"] + charListME = [b"321", b"221", b"121", b"021", b"421", b"521", b"621"] + + +class GE2ListTestCase(GeneratorTestCase): + # Scalar case + title = "Rank-1,2 case 2 (lists)" + numericalList = [3, 4] + numericalListME = [[3, 2, 1, 0, 4, 5, 
6], + [2, 1, 0, 4, 5, 6, 7], + [4, 3, 2, 1, 0, 4, 5], + [3, 2, 1, 0, 4, 5, 6], + [3, 2, 1, 0, 4, 5, 6]] + + charList = [b"a", b"b"] + charListME = [ + [b"321", b"221", b"121", b"021", b"421", b"521", b"621"], + [b"21", b"21", b"11", b"02", b"42", b"21", b"61"], + [b"31", b"21", b"12", b"21", b"41", b"51", b"621"], + [b"321", b"221", b"121", b"021", b"421", b"521", b"621"], + [b"3241", b"2321", b"13216", b"0621", b"4421", b"5421", b"a621"], + [b"a321", b"s221", b"d121", b"g021", b"b421", b"5vvv21", b"6zxzxs21"], + ] + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(Basic0DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic0DTwoTestCase)) + # theSuite.addTest(unittest.makeSuite(Basic1DZeroTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DTwoTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DFourTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic2DTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic10DTestCase)) + theSuite.addTest(common.unittest.makeSuite(GI1ListTestCase)) + theSuite.addTest(common.unittest.makeSuite(GI2ListTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE1ListTestCase)) + theSuite.addTest(common.unittest.makeSuite(GE2ListTestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_nestedtypes.py b/tables/tests/test_nestedtypes.py new file mode 100644 index 0000000..c2bb33d --- /dev/null +++ b/tables/tests/test_nestedtypes.py @@ -0,0 +1,1495 @@ +"""Test module for nested types under PyTables.""" + +import sys +import itertools + +import numpy as np + +import tables as tb +from tables.tests import common + +minRowIndex = 10 + + +# This is the structure of the table used for testing (DON'T PANIC!): +# +# +-+---------------------------------+-----+----------+-+-+ +# |x|Info |color|info |y|z| +# | +-----+--+----------------+----+--+ +----+-----+ | | +# | |value|y2|Info2 |name|z2| |Name|Value| | | +# | | | +----+-----+--+--+ | | | | | | | +# | | | |name|value|y3|z3| | | | | | | | +# +-+-----+--+----+-----+--+--+----+--+-----+----+-----+-+-+ +# +# Please note that some fields are explicitly ordered while others are +# ordered alphabetically by name. 
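+# Columns that declare an explicit position (``pos``/``_v_pos``) come first, in
+# that order; the remaining ones follow alphabetically.  A quick way to inspect
+# the resulting order (just a sketch, not used by the tests below) is:
+#
+#     descr = tb.description.Description(TestTDescr().columns)
+#     print(descr._v_names)         # top-level column order
+#     print(descr._v_nested_names)  # full order, including nested columns
+#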
+# The declaration of the nested table: +class Info(tb.IsDescription): + _v_pos = 3 + Name = tb.StringCol(itemsize=2) + Value = tb.ComplexCol(itemsize=16) + + +class TestTDescr(tb.IsDescription): + """A description that has several nested columns.""" + + x = tb.Int32Col(dflt=0, shape=2, pos=0) # 0 + y = tb.Float64Col(dflt=1, shape=(2, 2)) + z = tb.UInt8Col(dflt=1) + color = tb.StringCol(itemsize=2, dflt=b" ", pos=2) + info = Info() + + class Info(tb.IsDescription): # 1 + _v_pos = 1 + name = tb.StringCol(itemsize=2) + value = tb.ComplexCol(itemsize=16, pos=0) # 0 + y2 = tb.Float64Col(dflt=1, pos=1) # 1 + z2 = tb.UInt8Col(dflt=1) + + class Info2(tb.IsDescription): + y3 = tb.Time64Col(dflt=1, shape=2) + z3 = tb.EnumCol({'r': 4, 'g': 2, 'b': 1}, 'r', 'int32', shape=2) + name = tb.StringCol(itemsize=2) + value = tb.ComplexCol(itemsize=16, shape=2) + + +# The corresponding nested array description: +testADescr = [ + ('x', '(2,)int32'), + ('Info', [ + ('value', 'complex128'), + ('y2', 'float64'), + ('Info2', [ + ('name', 'a2'), + ('value', '(2,)complex128'), + ('y3', '(2,)float64'), + ('z3', '(2,)int32')]), + ('name', 'a2'), + ('z2', 'uint8')]), + ('color', 'a2'), + ('info', [ + ('Name', 'a2'), + ('Value', 'complex128')]), + ('y', '(2,2)float64'), + ('z', 'uint8')] + +# The corresponding nested array description (brief version): +testADescr2 = [ + ('x', '(2,)i4'), + ('Info', [ + ('value', '()c16'), + ('y2', '()f8'), + ('Info2', [ + ('name', '()S2'), + ('value', '(2,)c16'), + ('y3', '(2,)f8'), + ('z3', '(2,)i4')]), + ('name', '()S2'), + ('z2', '()u1')]), + ('color', '()S2'), + ('info', [ + ('Name', '()S2'), + ('Value', '()c16')]), + ('y', '(2, 2)f8'), + ('z', '()u1')] + +# A nested array for testing: +testABuffer = [ + # x Info color info y z + # value y2 Info2 name z2 Name Value + # name value y3 z3 + ((3, 2), (6j, 6., ('nn', (6j, 4j), (6., 4.), (1, 2)), + 'NN', 8), 'cc', ('NN', 6j), ((6., 4.), (6., 4.)), 8), + ((4, 3), (7j, 7., ('oo', (7j, 5j), (7., 5.), (2, 1)), + 'OO', 9), 'dd', ('OO', 7j), ((7., 5.), (7., 5.)), 9), +] +testAData = np.array(testABuffer, dtype=testADescr) +# The name of the column to be searched: +testCondCol = 'Info/z2' +# The name of a nested column (it can not be searched): +testNestedCol = 'Info' +# The condition to be applied on the column (all but the last row match it): +testCondition = '(2 < col) & (col < 9)' + + +def areDescriptionsEqual(desc1, desc2): + """Are both `desc1` and `desc2` equivalent descriptions? + + The arguments may be description objects (``IsDescription``, + ``Description``) or dictionaries. + + """ + + if isinstance(desc1, tb.Col): + # This is a rough comparison but it suffices here. + return (desc1.type == desc2.type + and desc2.dtype == desc2.dtype + and desc1._v_pos == desc2._v_pos + # and desc1.dflt == desc2.dflt) + and common.areArraysEqual(desc1.dflt, desc2.dflt)) + + if hasattr(desc1, '_v_colobjects'): # quacks like a Description + cols1 = desc1._v_colobjects + elif hasattr(desc1, 'columns'): # quacks like an IsDescription + cols1 = desc1.columns + else: # hope it quacks like a dictionary + cols1 = desc1 + + if hasattr(desc2, '_v_colobjects'): # quacks like a Description + cols2 = desc2._v_colobjects + elif hasattr(desc2, 'columns'): # quacks like an IsDescription + cols2 = desc2.columns + else: # hope it quacks like a dictionary + cols2 = desc2 + + if len(cols1) != len(cols2): + return False + + for (colName, colobj1) in cols1.items(): + colobj2 = cols2[colName] + if colName == '_v_pos': + # The comparison may not be quite exhaustive! 
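+ # (for dictionary descriptions '_v_pos' maps to a plain integer, so
+ # direct equality is all that can be checked here)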
+ return colobj1 == colobj2 + if not areDescriptionsEqual(colobj1, colobj2): + return False + + return True + + +# Test creating nested column descriptions +class DescriptionTestCase(common.PyTablesTestCase): + _TestTDescr = TestTDescr + _testADescr = testADescr + _testADescr2 = testADescr2 + _testAData = testAData + + def test00_instance(self): + """Creating an instance of a nested description.""" + + self.assertTrue( + areDescriptionsEqual(self._TestTDescr, self._TestTDescr()), + "Table description does not match the given one.") + + def test01_instance(self): + """Checking attrs of an instance of a nested description.""" + + descr = tb.description.Description(self._TestTDescr().columns) + if common.verbose: + print("Generated description:", descr._v_nested_descr) + print("Should look like:", self._testADescr2) + self.assertEqual(self._testADescr2, descr._v_nested_descr, + "Description._v_nested_descr does not match.") + + +# Test creating a nested table and opening it +class CreateTestCase(common.TempFileMixin, common.PyTablesTestCase): + _TestTDescr = TestTDescr + _testABuffer = testABuffer + _testAData = testAData + + def _checkColumns(self, cols, desc): + """Check that `cols` has all the accessors for `self._TestTDescr`.""" + + # ``_desc`` is a leaf column and ``cols`` a ``Column``. + if isinstance(desc, tb.Col): + return isinstance(cols, tb.Column) + + # ``_desc`` is a description object and ``cols`` a ``Cols``. + descColumns = desc._v_colobjects + for colName in descColumns: + if colName not in cols._v_colnames: + return False + if not self._checkColumns(cols._f_col(colName), + descColumns[colName]): + return False + + return True + + def _checkDescription(self, table): + """Check that description of `table` matches `self._TestTDescr`.""" + + # Compare descriptions. + self.assertTrue( + areDescriptionsEqual(self._TestTDescr, table.description), + "Table description does not match the given one.") + # Check access to columns. 
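+ # Every column declared in the description, nested or not, must be
+ # reachable through the ``table.cols`` accessor.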
+ self._checkColumns(table.cols, table.description) + + def _checkColinstances(self, table): + """Check that ``colinstances`` and ``cols`` of `table` match.""" + for colpathname in table.description._v_pathnames: + self.assertTrue(table.colinstances[colpathname] + is table.cols._f_col(colpathname)) + + def test00_create(self): + """Creating a nested table.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + self._checkDescription(tbl) + self._checkColinstances(tbl) + + def test01_open(self): + """Opening a nested table.""" + + self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + self._reopen() + tbl = self.h5file.root.test + self._checkDescription(tbl) + self._checkColinstances(tbl) + + def test02_NestedRecArrayCompat(self): + """Creating a compatible nested record array``.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + self.assertTrue(common.areArraysEqual(nrarr, self._testAData), + "Can not create a compatible structured array.") + + def test03_NRA(self): + """Creating a table from a nested record array object.""" + + tbl = self.h5file.create_table( + '/', 'test', self._testAData, title=self._getMethodName()) + tbl.flush() + readAData = tbl.read() + if common.verbose: + print("Read data:", readAData) + print("Should look like:", self._testAData) + self.assertTrue(common.areArraysEqual(self._testAData, readAData), + "Written and read values differ.") + + def test04_NRA2(self): + """Creating a table from a generated nested record array object.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + readAData = tbl.read() + + tbl2 = self.h5file.create_table( + '/', 'test2', readAData, title=self._getMethodName()) + readAData2 = tbl2.read() + + self.assertTrue(common.areArraysEqual(self._testAData, readAData2), + "Written and read values differ.") + + +# Test writing data in a nested table +class WriteTestCase(common.TempFileMixin, common.PyTablesTestCase): + _TestTDescr = TestTDescr + _testAData = testAData + _testCondition = testCondition + _testCondCol = testCondCol + _testNestedCol = testNestedCol + + def _testCondVars(self, table): + """Get condition variables for the given `table`.""" + return {'col': table.cols._f_col(self._testCondCol)} + + def _testNestedCondVars(self, table): + """Get condition variables for the given `table`.""" + return {'col': table.cols._f_col(self._testNestedCol)} + + def _appendRow(self, row, index): + """ + Append the `index`-th row in `self._testAData` to `row`. + + Values are set field-by-field (be it nested or not). 
+ """ + + record = self._testAData[index] + for fieldName in self._testAData.dtype.names: + row[fieldName] = record[fieldName] + row.append() + + def test00_append(self): + """Appending a set of rows.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + readAData = tbl.read() + self.assertTrue(common.areArraysEqual(self._testAData, readAData), + "Written and read values differ.") + + def test01_row(self): + """Appending individual rows.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + row = tbl.row + # Add the first row + self._appendRow(row, 0) + # Add the rest of the rows field by field. + for i in range(1, len(self._testAData)): + self._appendRow(row, i) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + readAData = tbl.read() + self.assertTrue(common.areArraysEqual(self._testAData, readAData), + "Written and read values differ.") + + def test02_where(self): + """Searching nested data.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + searchedCoords = tbl.get_where_list( + self._testCondition, self._testCondVars(tbl)) + + # All but the last row match the condition. + searchedCoords.sort() + self.assertEqual(searchedCoords.tolist(), + list(range(len(self._testAData) - 1)), + "Search returned incorrect results.") + + def test02b_whereAppend(self): + """Searching nested data and appending it to another table.""" + + tbl1 = self.h5file.create_table( + '/', 'test1', self._TestTDescr, title=self._getMethodName()) + tbl1.append(self._testAData) + tbl1.flush() + + tbl2 = self.h5file.create_table( + '/', 'test2', self._TestTDescr, title=self._getMethodName()) + tbl1.append_where( + tbl2, self._testCondition, self._testCondVars(tbl1)) + + if self.reopen: + self._reopen() + tbl1 = self.h5file.root.test1 + tbl2 = self.h5file.root.test2 + + searchedCoords = tbl2.get_where_list( + self._testCondition, self._testCondVars(tbl2)) + + # All but the last row match the condition. + searchedCoords.sort() + self.assertEqual(searchedCoords.tolist(), + list(range(len(self._testAData) - 1)), + "Search returned incorrect results.") + + def test03_colscond(self): + """Searching on a column with nested columns.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + self.assertRaises( + TypeError, tbl.get_where_list, + self._testCondition, self._testNestedCondVars(tbl)) + + def test04_modifyColumn(self): + """Modifying one single nested column (modify_column).""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + nColumn = self._testNestedCol + # Get the nested column data and swap the first and last rows. + raTable = self._testAData.copy() + raColumn = raTable[nColumn] + # The next will not work until NestedRecords supports copies + (raColumn[0], raColumn[-1]) = (raColumn[-1], raColumn[0]) + + # Write the resulting column and re-read the whole table. 
+ tbl.modify_column(colname=nColumn, column=raColumn) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + raReadTable = tbl.read() + if common.verbose: + print("Table read:", raReadTable) + print("Should look like:", raTable) + + # Compare it to the written one. + self.assertTrue(common.areArraysEqual(raTable, raReadTable), + "Written and read values differ.") + + def test05a_modifyColumns(self): + """Modifying one nested column (modify_columns).""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + nColumn = self._testNestedCol + # Get the nested column data and swap the first and last rows. + raTable = self._testAData.copy() + raColumn = raTable[nColumn] + (raColumn[0], raColumn[-1]) = (raColumn[-1].copy(), raColumn[0].copy()) + newdtype = np.dtype([(nColumn, raTable.dtype.fields[nColumn][0])]) + self.assertIsNotNone(newdtype) + + # Write the resulting column and re-read the whole table. + tbl.modify_columns(names=[nColumn], columns=raColumn) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + raReadTable = tbl.read() + if common.verbose: + print("Table read:", raReadTable) + print("Should look like:", raTable) + + # Compare it to the written one. + self.assertTrue(common.areArraysEqual(raTable, raReadTable), + "Written and read values differ.") + + def test05b_modifyColumns(self): + """Modifying two nested columns (modify_columns).""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + # Get the nested column data and swap the first and last rows. + colnames = ['x', 'color'] # Get the first two columns + raCols = np.rec.fromarrays([ + self._testAData['x'].copy(), + self._testAData['color'].copy()], + dtype=[('x', '(2,)i4'), ('color', 'a2')]) + # descr=tbl.description._v_nested_descr[0:2]) + # or... + # names=tbl.description._v_nested_names[0:2], + # formats=tbl.description._v_nested_formats[0:2]) + (raCols[0], raCols[-1]) = (raCols[-1].copy(), raCols[0].copy()) + + # Write the resulting columns + tbl.modify_columns(names=colnames, columns=raCols) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + # Re-read the appropriate columns + raCols2 = np.rec.fromarrays([tbl.cols._f_col('x'), + tbl.cols._f_col('color')], + dtype=raCols.dtype) + if common.verbose: + print("Table read:", raCols2) + print("Should look like:", raCols) + + # Compare it to the written one. + self.assertTrue(common.areArraysEqual(raCols, raCols2), + "Written and read values differ.") + + def test06_modifyRows(self): + """Checking modifying several rows at once (using nested rec array)""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + # Get the nested record and swap the first and last rows. + raTable = self._testAData.copy() + (raTable[0], raTable[-1]) = (raTable[-1].copy(), raTable[0].copy()) + + # Write the resulting nested record and re-read the whole table. + tbl.modify_rows(start=0, stop=2, rows=raTable) + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + raReadTable = tbl.read() + if common.verbose: + print("Table read:", raReadTable) + print("Should look like:", raTable) + + # Compare it to the written one. 
+ self.assertTrue(common.areArraysEqual(raTable, raReadTable), + "Written and read values differ.") + + def test07_index(self): + """Checking indexes of nested columns.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName(), + expectedrows=minRowIndex * 2) + for i in range(minRowIndex): + tbl.append(self._testAData) + tbl.flush() + coltoindex = tbl.cols._f_col(self._testCondCol) + indexrows = coltoindex.create_index() + self.assertIsNotNone(indexrows) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + coltoindex = tbl.cols._f_col(self._testCondCol) + + if common.verbose: + print("Number of written rows:", tbl.nrows) + print("Number of indexed rows:", coltoindex.index.nelements) + + # Check indexing flags: + self.assertEqual(tbl.indexed, True, "Table not indexed") + self.assertNotEqual(coltoindex.index, None, "Column not indexed") + self.assertTrue(tbl.colindexed[ + self._testCondCol], "Column not indexed") + # Do a look-up for values + searchedCoords = tbl.get_where_list( + self._testCondition, self._testCondVars(tbl)) + searchedCoords.sort() + + expectedCoords = np.arange(0, minRowIndex * 2, 2, tb.utils.SizeType) + if common.verbose: + print("Searched coords:", searchedCoords) + print("Expected coords:", expectedCoords) + # All even rows match the condition. + self.assertEqual(searchedCoords.tolist(), expectedCoords.tolist(), + "Search returned incorrect results.") + + def test08_setNestedField(self): + """Checking modifying a nested field via natural naming.""" + # See ticket #93 (http://www.pytables.org/trac/ticket/93). + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + tbl.flush() + + oldvalue = tbl.cols.Info.z2[0] + tbl.cols.Info.z2[0] = oldvalue + 1 + tbl.flush() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + newvalue = tbl.cols.Info.z2[0] + self.assertEqual(newvalue, oldvalue + 1) + + +class WriteNoReopen(WriteTestCase): + reopen = 0 + + +class WriteReopen(WriteTestCase): + reopen = 1 + + +class ReadTestCase(common.TempFileMixin, common.PyTablesTestCase): + _TestTDescr = TestTDescr + _testABuffer = testABuffer + _testAData = testAData + _testNestedCol = testNestedCol + + def test00a_repr(self): + """Checking representation of a nested Table.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title="test00") + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + if common.verbose: + print("str(tbl)-->", str(tbl)) + print("repr(tbl)-->", repr(tbl)) + + self.assertEqual(str(tbl), "/test (Table(2,)) 'test00'") + tblrepr = repr(tbl) + # Remove the platform-dependent information (i.e. 
byteorder) + tblrepr = "\n".join(tblrepr.split("\n")[:-2]) + "\n" + template = """/test (Table(2,)) 'test00' + description := { + "x": Int32Col(shape=(2,), dflt=0, pos=0), + "Info": { + "value": ComplexCol(itemsize=16, shape=(), dflt=0j, pos=0), + "y2": Float64Col(shape=(), dflt=1.0, pos=1), + "Info2": { + "name": StringCol(itemsize=2, shape=(), dflt=b'', pos=0), + "value": ComplexCol(itemsize=16, shape=(2,), dflt=0j, pos=1), + "y3": Time64Col(shape=(2,), dflt=1.0, pos=2), + "z3": EnumCol(enum=Enum({%(value)s}), dflt='%(default)s', base=Int32Atom(shape=(), dflt=0), shape=(2,), pos=3)}, + "name": StringCol(itemsize=2, shape=(), dflt=b'', pos=3), + "z2": UInt8Col(shape=(), dflt=1, pos=4)}, + "color": StringCol(itemsize=2, shape=(), dflt=b' ', pos=2), + "info": { + "Name": StringCol(itemsize=2, shape=(), dflt=b'', pos=0), + "Value": ComplexCol(itemsize=16, shape=(), dflt=0j, pos=1)}, + "y": Float64Col(shape=(2, 2), dflt=1.0, pos=4), + "z": UInt8Col(shape=(), dflt=1, pos=5)} +""" + + # The problem here is that the order in which items are stored in a + # dict can't be assumed to be stable. + # From python 3.3 on it is actually no more stable since the + # "Hash randomization" feature is enable by default. + # + # For this reason we generate a representation string for each of the + # prmutations of the Enum items. + # + # Also the default value of enum types is not preserved in HDF5. + # It is assumed that the default value is the first one in the array + # of Enum names and hence it is also affected by the issue related to + # the "Hash randomization" feature. + # + # Also in this case it is genereted a representation string for each + # of the possible default values. + enums = [ + ', '.join(items) for items in itertools.permutations( + ("'r': 4", "'b': 1", "'g': 2")) + ] + defaults = ('r', 'b', 'g') + values = [ + template % {'value': v, 'default': d} + for v, d in itertools.product(enums, defaults) + ] + self.assertIn(tblrepr, values) + + def test00b_repr(self): + """Checking representation of a root Column.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title="test00") + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + if common.verbose: + print("str(tbl.cols.y)-->'%s'" % str(tbl.cols.y)) + print("repr(tbl.cols.y)-->'%s'" % repr(tbl.cols.y)) + + self.assertEqual(str(tbl.cols.y), + "/test.cols.y (Column(2, 2, 2), float64, idx=None)") + self.assertEqual(repr(tbl.cols.y), + "/test.cols.y (Column(2, 2, 2), float64, idx=None)") + + def test00c_repr(self): + """Checking representation of a nested Column.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title="test00") + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + if common.verbose: + print("str(tbl.cols.Info.z2)-->'%s'" % str(tbl.cols.Info.z2)) + print("repr(tbl.cols.Info.z2)-->'%s'" % repr(tbl.cols.Info.z2)) + + self.assertEqual(str(tbl.cols.Info.z2), + "/test.cols.Info.z2 (Column(2,), uint8, idx=None)") + self.assertEqual(repr(tbl.cols.Info.z2), + "/test.cols.Info.z2 (Column(2,), uint8, idx=None)") + + def test01_read(self): + """Checking Table.read with subgroups with a range index with step.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.rec.array(testABuffer, + dtype=tbl.description._v_nested_descr) + tblcols = 
tbl.read(start=0, step=2, field='Info') + nrarrcols = nrarr['Info'][0::2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test01_read_out_arg(self): + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.rec.array(testABuffer, + dtype=tbl.description._v_nested_descr) + # When reading an entire nested column, the output array must contain + # all fields in the table. The output buffer will contain the contents + # of all fields. The selected column alone will be returned from the + # method call. + all_cols = np.empty(1, tbl.dtype) + tblcols = tbl.read(start=0, step=2, field='Info', out=all_cols) + nrarrcols = nrarr['Info'][0::2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + self.assertTrue(common.areArraysEqual(nrarr[0::2], all_cols), + "Output buffer does not match full table.") + + def test02_read(self): + """Checking Table.read with a nested Column.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + tblcols = tbl.read(start=0, step=2, field='Info/value') + nrarr = np.rec.array(testABuffer, + dtype=tbl.description._v_nested_descr) + nrarrcols = nrarr['Info']['value'][0::2] + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test02_read_out_arg(self): + """Checking Table.read with a nested Column.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + tblcols = np.empty(1, dtype='c16') + tbl.read(start=0, step=2, field='Info/value', out=tblcols) + nrarr = np.rec.array(testABuffer, + dtype=tbl.description._v_nested_descr) + nrarrcols = nrarr['Info']['value'][0::2] + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + +class ReadNoReopen(ReadTestCase): + reopen = 0 + + +class ReadReopen(ReadTestCase): + reopen = 1 + + +# Checking the Table.Cols accessor +class ColsTestCase(common.TempFileMixin, common.PyTablesTestCase): + _TestTDescr = TestTDescr + _testABuffer = testABuffer + _testAData = testAData + _testNestedCol = testNestedCol + + def test00a_repr(self): + """Checking string representation of Cols.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title="test00") + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + if common.verbose: + print("str(tbl.cols)-->", str(tbl.cols)) + print("repr(tbl.cols)-->", repr(tbl.cols)) + + self.assertEqual(str(tbl.cols), "/test.cols (Cols), 6 columns") + try: + self.assertEqual(repr(tbl.cols), + """/test.cols (Cols), 6 columns + x (Column(0, 2), ('int32',(2,))) + Info (Cols(), Description) + color (Column(0,), |S2) + info (Cols(), Description) + y (Column(0, 2, 2), ('float64',(2, 2))) + z (Column(0,), uint8) +""" + ) + except AssertionError: + self.assertEqual(repr(tbl.cols), + """/test.cols (Cols), 6 columns + x (Column(0, 2), 
('{}', (2,))) + Info (Cols(), Description) + color (Column(0,), |S2) + info (Cols(), Description) + y (Column(0, 2, 2), ('{}', (2, 2))) + z (Column(0,), uint8) +""".format(np.int32(0).dtype.str, np.float64(0).dtype.str)) + + def test00b_repr(self): + """Checking string representation of nested Cols.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + if common.verbose: + print("str(tbl.cols.Info)-->", str(tbl.cols.Info)) + print("repr(tbl.cols.Info)-->", repr(tbl.cols.Info)) + + self.assertEqual(str( + tbl.cols.Info), "/test.cols.Info (Cols), 5 columns") + self.assertEqual(repr(tbl.cols.Info), + """/test.cols.Info (Cols), 5 columns + value (Column(0,), complex128) + y2 (Column(0,), float64) + Info2 (Cols(), Description) + name (Column(0,), |S2) + z2 (Column(0,), uint8) +""") + + def test01a_f_col(self): + """Checking cols._f_col() with a subgroup.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + tblcol = tbl.cols._f_col(self._testNestedCol) + if common.verbose: + print("Column group name:", tblcol._v_desc._v_pathname) + self.assertEqual(tblcol._v_desc._v_pathname, self._testNestedCol, + "Column group name doesn't match.") + + def test01b_f_col(self): + """Checking cols._f_col() with a column.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + tblcol = tbl.cols._f_col(self._testNestedCol + "/name") + if common.verbose: + print("Column name:", tblcol.name) + self.assertEqual(tblcol.name, "name", "Column name doesn't match.") + + def test01c_f_col(self): + """Checking cols._f_col() with a nested subgroup.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + tblcol = tbl.cols._f_col(self._testNestedCol + "/Info2") + if common.verbose: + print("Column group name:", tblcol._v_desc._v_pathname) + self.assertEqual(tblcol._v_desc._v_pathname, + self._testNestedCol + "/Info2", + "Column group name doesn't match.") + + def test02a__len__(self): + """Checking cols.__len__() in root level.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + length = len(tbl.cols) + if common.verbose: + print("Column group length:", length) + self.assertEqual(length, len(tbl.colnames), + "Column group length doesn't match.") + + def test02b__len__(self): + """Checking cols.__len__() in subgroup level.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + length = len(tbl.cols.Info) + if common.verbose: + print("Column group length:", length) + self.assertEqual(length, len(tbl.cols.Info._v_colnames), + "Column group length doesn't match.") + + def test03a__getitem__(self): + """Checking cols.__getitem__() with a single index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols[1] + nrarrcols = nrarr[1] + if common.verbose: + print("Read cols:", 
tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test03b__getitem__(self): + """Checking cols.__getitem__() with a range index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols[0:2] + nrarrcols = nrarr[0:2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test03c__getitem__(self): + """Checking cols.__getitem__() with a range index with step.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols[0::2] + nrarrcols = nrarr[0::2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test04a__getitem__(self): + """Checking cols.__getitem__() with subgroups with a single index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info')[1] + nrarrcols = nrarr['Info'][1] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test04b__getitem__(self): + """Checking cols.__getitem__() with subgroups with a range index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info')[0:2] + nrarrcols = nrarr['Info'][0:2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test04c__getitem__(self): + """Checking cols.__getitem__() with subgroups with a range index with + step.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info')[0::2] + nrarrcols = nrarr['Info'][0::2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test05a__getitem__(self): + """Checking cols.__getitem__() with a column with a single index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) 
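+ # Populate the freshly created table so the single-element reads below
+ # have data to compare against.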
+ tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info/value')[1] + nrarrcols = nrarr['Info']['value'][1] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertEqual(nrarrcols, tblcols, + "Original array are retrieved doesn't match.") + + def test05b__getitem__(self): + """Checking cols.__getitem__() with a column with a range index.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info/value')[0:2] + nrarrcols = nrarr['Info']['value'][0:2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test05c__getitem__(self): + """Checking cols.__getitem__() with a column with a range index with + step.""" + + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + tblcols = tbl.cols._f_col('Info/value')[0::2] + nrarrcols = nrarr['Info']['value'][0::2] + if common.verbose: + print("Read cols:", tblcols) + print("Should look like:", nrarrcols) + self.assertTrue(common.areArraysEqual(nrarrcols, tblcols), + "Original array are retrieved doesn't match.") + + def test_01a__iter__(self): + tbl = self.h5file.create_table( + '/', 'test', self._TestTDescr, title=self._getMethodName()) + tbl.append(self._testAData) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + nrarr = np.array(testABuffer, dtype=tbl.description._v_nested_descr) + row_num = 0 + for item in tbl.cols.Info.value: + self.assertEqual(item, nrarr['Info']['value'][row_num]) + row_num += 1 + self.assertEqual(row_num, len(nrarr)) + + +class ColsNoReopen(ColsTestCase): + reopen = 0 + + +class ColsReopen(ColsTestCase): + reopen = 1 + + +class Nested(tb.IsDescription): + uid = tb.IntCol(pos=1) + value = tb.FloatCol(pos=2) + + +class A_Candidate(tb.IsDescription): + nested1 = Nested() + nested2 = Nested() + + +class B_Candidate(tb.IsDescription): + nested1 = Nested + nested2 = Nested + + +class C_Candidate(tb.IsDescription): + nested1 = Nested() + nested2 = Nested + + +Dnested = { + 'uid': tb.IntCol(pos=1), + 'value': tb.FloatCol(pos=2), +} + +D_Candidate = { + "nested1": Dnested, + "nested2": Dnested, +} + +E_Candidate = { + "nested1": Nested, + "nested2": Dnested, +} + +F_Candidate = { + "nested1": Nested(), + "nested2": Dnested, +} + +# Checking several nested columns declared in the same way + + +class SameNestedTestCase(common.TempFileMixin, common.PyTablesTestCase): + correct_names = [ + '', # The root of columns + 'nested1', 'nested1/uid', 'nested1/value', + 'nested2', 'nested2/uid', 'nested2/value', + ] + + def test01a(self): + """Checking same nested columns (instance flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', A_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + 
if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test01b(self): + """Checking same nested columns (class flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', B_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test01c(self): + """Checking same nested columns (mixed instance/class flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', C_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test01d(self): + """Checking same nested columns (dictionary flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', D_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test01e(self): + """Checking same nested columns (mixed dictionary/class flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', E_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test01f(self): + """Checking same nested columns (mixed dictionary/instance flavor).""" + + tbl = self.h5file.create_table( + '/', 'test', F_Candidate, title=self._getMethodName()) + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + + names = [col._v_pathname for col in tbl.description._f_walk( + type="All")] + if common.verbose: + print("Pathnames of columns:", names) + print("Should look like:", self.correct_names) + self.assertEqual(names, self.correct_names, + "Column nested names doesn't match.") + + def test02a(self): + """Indexing two simple columns under the same nested column.""" + + desc = { + 'nested': { + 'i1': tb.Int32Col(), + 'i2': tb.Int32Col() + } + } + + i1 = 'nested/i1' + i2 = 'nested/i2' + tbl = self.h5file.create_table( + '/', 'test', desc, title=self._getMethodName()) + + row = tbl.row + for i in range(1000): + row[i1] = i + row[i2] = i * 2 + row.append() + tbl.flush() + + cols = { + 'i1': tbl.cols.nested.i1, + 'i2': tbl.cols.nested.i2, + } + cols['i1'].create_index() + cols['i2'].create_index() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + # Redefine the cols dictionary + cols = {'i1': tbl.cols.nested.i1, + 'i2': tbl.cols.nested.i2, } + + i1res = [r[i1] for r in tbl.where('i1 < 10', 
cols)] + i2res = [r[i2] for r in tbl.where('i2 < 10', cols)] + + if common.verbose: + print("Retrieved values (i1):", i1res) + print("Should look like:", list(range(10))) + print("Retrieved values (i2):", i2res) + print("Should look like:", list(range(0, 10, 2))) + + self.assertEqual(i1res, list(range(10)), + "Select for nested column (i1) doesn't match.") + self.assertEqual(i2res, list(range(0, 10, 2)), + "Select for nested column (i2) doesn't match.") + + def test02b(self): + """Indexing two simple columns under the same (very) nested column.""" + + desc = { + 'nested1': { + 'nested2': { + 'nested3': { + 'i1': tb.Int32Col(), + 'i2': tb.Int32Col() + } + } + } + } + + i1 = 'nested1/nested2/nested3/i1' + i2 = 'nested1/nested2/nested3/i2' + + tbl = self.h5file.create_table( + '/', 'test', desc, title=self._getMethodName()) + + row = tbl.row + for i in range(1000): + row[i1] = i + row[i2] = i * 2 + row.append() + tbl.flush() + + cols = {'i1': tbl.cols.nested1.nested2.nested3.i1, + 'i2': tbl.cols.nested1.nested2.nested3.i2, } + cols['i1'].create_index() + cols['i2'].create_index() + + if self.reopen: + self._reopen() + tbl = self.h5file.root.test + # Redefine the cols dictionary + cols = {'i1': tbl.cols.nested1.nested2.nested3.i1, + 'i2': tbl.cols.nested1.nested2.nested3.i2, } + + i1res = [r[i1] for r in tbl.where('i1 < 10', cols)] + i2res = [r[i2] for r in tbl.where('i2 < 10', cols)] + + if common.verbose: + print("Retrieved values (i1):", i1res) + print("Should look like:", list(range(10))) + print("Retrieved values (i2):", i2res) + print("Should look like:", list(range(0, 10, 2))) + + self.assertEqual(i1res, list(range(10)), + "Select for nested column (i1) doesn't match.") + self.assertEqual(i2res, list(range(0, 10, 2)), + "Select for nested column (i2) doesn't match.") + + +class SameNestedNoReopen(SameNestedTestCase): + reopen = 0 + + +class SameNestedReopen(SameNestedTestCase): + reopen = 1 + + +class NestedTypesWithGaps(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename('nested-type-with-gaps.h5') + + correct_descr = """{ + "float": Float32Col(shape=(), dflt=0.0, pos=0), + "compound": { + "char": Int8Col(shape=(), dflt=0, pos=0), + "double": Float64Col(shape=(), dflt=0.0, pos=1)}}""" + + def test01(self): + """Opening a table with nested types with gaps.""" + + tbl = self.h5file.get_node('/nestedtype') + type_descr = repr(tbl.description) + if common.verbose: + print("Type size with gaps:", tbl.description._v_itemsize) + print("And should be: 16") + print("Representation of the nested type:\n", type_descr) + print("And should be:\n", self.correct_descr) + print("Here are the offsets: ", tbl.description._v_offsets) + + self.assertEqual(tbl.description._v_itemsize, 16) + self.assertEqual(type_descr, self.correct_descr) + + if common.verbose: + print("Great! 
Nested types with gaps recognized correctly.") + + +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(DescriptionTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateTestCase)) + theSuite.addTest(common.unittest.makeSuite(WriteNoReopen)) + theSuite.addTest(common.unittest.makeSuite(WriteReopen)) + theSuite.addTest(common.unittest.makeSuite(ColsNoReopen)) + theSuite.addTest(common.unittest.makeSuite(ColsReopen)) + theSuite.addTest(common.unittest.makeSuite(ReadNoReopen)) + theSuite.addTest(common.unittest.makeSuite(ReadReopen)) + theSuite.addTest(common.unittest.makeSuite(SameNestedNoReopen)) + theSuite.addTest(common.unittest.makeSuite(SameNestedReopen)) + theSuite.addTest(common.unittest.makeSuite(NestedTypesWithGaps)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_numpy.py b/tables/tests/test_numpy.py new file mode 100644 index 0000000..18bac27 --- /dev/null +++ b/tables/tests/test_numpy.py @@ -0,0 +1,1402 @@ +import sys +import tempfile +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +typecodes = ['b', 'h', 'i', 'l', 'q', 'f', 'd'] +# UInt64 checking disabled on win platforms +# because this type is not supported +if sys.platform != 'win32': + typecodes += ['B', 'H', 'I', 'L', 'Q', 'F', 'D'] +else: + typecodes += ['B', 'H', 'I', 'L', 'F', 'D'] +typecodes += ['b1'] # boolean + +if hasattr(tb, 'Float16Atom'): + typecodes.append('e') +if hasattr(tb, 'Float96Atom') or hasattr(tb, 'Float128Atom'): + typecodes.append('g') +if hasattr(tb, 'Complex192Atom') or hasattr(tb, 'Conplex256Atom'): + typecodes.append('G') + +byteorder = {'little': '<', 'big': '>'}[sys.byteorder] + + +class BasicTestCase(common.PyTablesTestCase): + """Basic test for all the supported typecodes present in NumPy. + + All of them are included on PyTables. + + """ + endiancheck = 0 + + def WriteRead(self, testArray): + if common.verbose: + print('\n', '-=' * 30) + print("Running test for array with typecode '%s'" % + testArray.dtype.char, end=' ') + print("for class check:", self.title) + + # Create an instance of HDF5 Table + self.h5fname = tempfile.mktemp(".h5") + try: + with tb.open_file(self.h5fname, mode="w") as self.h5file: + self.root = self.h5file.root + + # Create the array under root and name 'somearray' + a = testArray + self.h5file.create_array(self.root, 'somearray', a, + "Some array") + + # Re-open the file in read-only mode + with tb.open_file(self.h5fname, mode="r") as self.h5file: + self.root = self.h5file.root + + # Read the saved array + b = self.root.somearray.read() + + # For cases that read returns a python type instead of a + # numpy type + if not hasattr(b, "shape"): + b = np.np.array(b, dtype=a.dtype.str) + + # Compare them. They should be equal. 
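+ # The strict checks that follow compare type, shape, dtype and element-wise contents of the written array ``a`` against the array ``b`` read back from the file.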
+ # if not allequal(a,b, "numpy") and common.verbose: + if common.verbose: + print("Array written:", a) + print("Array written shape:", a.shape) + print("Array written itemsize:", a.itemsize) + print("Array written type:", a.dtype.char) + print("Array read:", b) + print("Array read shape:", b.shape) + print("Array read itemsize:", b.itemsize) + print("Array read type:", b.dtype.char) + + type_ = self.root.somearray.atom.type + + # Check strictly the array equality + self.assertEqual(type(a), type(b)) + self.assertEqual(a.shape, b.shape) + self.assertEqual(a.shape, self.root.somearray.shape) + self.assertEqual(a.dtype, b.dtype) + if a.dtype.char[0] == "S": + self.assertEqual(type_, "string") + else: + self.assertEqual(a.dtype.base.name, type_) + + self.assertTrue(common.allequal(a, b, "numpy")) + finally: + # Then, delete the file + if Path(self.h5fname).is_file(): + Path(self.h5fname).unlink() + + def test00_char(self): + """Data integrity during recovery (character objects)""" + + a = np.array(self.tupleChar, 'S'+str(len(self.tupleChar))) + self.WriteRead(a) + + def test01_char_nc(self): + """Data integrity during recovery (non-contiguous character objects)""" + + a = np.array(self.tupleChar, 'S'+str(len(self.tupleChar))) + if a.shape == (): + b = a # We cannot use the indexing notation + else: + b = a[::2] + # Ensure that this numpy string is non-contiguous + if a.shape[0] > 2: + self.assertEqual(b.flags['CONTIGUOUS'], False) + self.WriteRead(b) + + def test02_types(self): + """Data integrity during recovery (numerical types)""" + + for typecode in typecodes: + if self.tupleInt.shape: + a = self.tupleInt.astype(typecode) + else: + # shape is the empty tuple () + a = np.array(self.tupleInt, dtype=typecode) + self.WriteRead(a) + + def test03_types_nc(self): + """Data integrity during recovery (non-contiguous numerical types)""" + + for typecode in typecodes: + if self.tupleInt.shape: + a = self.tupleInt.astype(typecode) + else: + # shape is the empty tuple () + a = np.array(self.tupleInt, dtype=typecode) + + # This should not be tested for the rank-0 case + if len(a.shape) == 0: + raise common.unittest.SkipTest + b = a[::2] + + # Ensure that this array is non-contiguous (for non-trivial case) + if a.shape[0] > 2: + self.assertEqual(b.flags['CONTIGUOUS'], False) + self.WriteRead(b) + + +class Basic0DOneTestCase(BasicTestCase): + # Rank-0 case + title = "Rank-0 case 1" + tupleInt = np.array(3) + tupleChar = "4" + + +class Basic0DTwoTestCase(BasicTestCase): + # Rank-0 case + title = "Rank-0 case 2" + tupleInt = np.array(33) + tupleChar = "44" + + +class Basic1DOneTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 1" + tupleInt = np.array((3,)) + tupleChar = ("a",) + + +class Basic1DTwoTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 2" + tupleInt = np.array((0, 4)) + tupleChar = ("aaa",) + + +class Basic1DThreeTestCase(BasicTestCase): + # 1D case + title = "Rank-1 case 3" + tupleInt = np.array((3, 4, 5)) + tupleChar = ("aaaa", "bbb",) + + +class Basic2DTestCase(BasicTestCase): + # 2D case + title = "Rank-2 case 1" + # tupleInt = reshape(np.array(np.arange((4)**2)), (4,)*2) + tupleInt = np.ones((4,)*2) + tupleChar = [["aaa", "ddddd"], ["d", "ss"], ["s", "tt"]] + + +class Basic10DTestCase(BasicTestCase): + # 10D case + title = "Rank-10 case 1" + # tupleInt = reshape(np.array(np.arange((2)**10)), (2,)*10) + tupleInt = np.ones((2,)*10) + # tupleChar = reshape(np.array([1],dtype="S1"),(1,)*10) + # The next tuple consumes far more time, so this + # test should be run in 
common.heavy mode. + tupleChar = np.array(tupleInt, dtype="S1") + + +# class Basic32DTestCase(BasicTestCase): +# # 32D case (maximum) +# tupleInt = reshape(np.array((22,)), (1,)*32) +# # Strings seems to be very slow with somewhat large dimensions +# # This should not be run unless the numarray people address this problem +# # F. Alted 2006-01-04 +# tupleChar = np.array(tupleInt, dtype="S1") + + +class GroupsArrayTestCase(common.TempFileMixin, common.PyTablesTestCase): + """This test class checks combinations of arrays with groups. + + It also uses arrays ranks which ranges until 10. + + """ + + def test00_iterativeGroups(self): + """Checking combinations of arrays with groups + + It also uses arrays ranks which ranges until 10. + + """ + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_iterativeGroups..." % + self.__class__.__name__) + + # Get the root group + group = self.h5file.root + + i = 1 + for typecode in typecodes: + # Create an array of typecode, with incrementally bigger ranges + a = np.ones((2,) * i, typecode) + # Save it on the HDF5 file + dsetname = 'array_' + typecode + if common.verbose: + print("Creating dataset:", group._g_join(dsetname)) + self.h5file.create_array(group, dsetname, a, "Large array") + # Create a new group + group = self.h5file.create_group(group, 'group' + str(i)) + # increment the range for next iteration + i += 1 + + self._reopen() + + # Get the root group + group = self.h5file.root + + # Get the metadata on the previosly saved arrays + for i in range(1, len(typecodes)): + # Create an array for later comparison + a = np.ones((2,) * i, typecodes[i - 1]) + # Get the dset object hanging from group + dset = getattr(group, 'array_' + typecodes[i-1]) + # Get the actual array + b = dset.read() + if not common.allequal(a, b, "numpy") and common.verbose: + print("Array a original. Shape: ==>", a.shape) + print("Array a original. Data: ==>", a) + print("Info from dataset:", dset._v_pathname) + print(" shape ==>", dset.shape, end=' ') + print(" dtype ==> %s" % dset.dtype) + print("Array b read from file. Shape: ==>", b.shape, end=' ') + print(". Type ==> %s" % b.dtype.char) + + self.assertEqual(a.shape, b.shape) + if np.dtype('l').itemsize == 4: + if (a.dtype.char == "i" or a.dtype.char == "l"): + # Special expection. We have no way to distinguish between + # "l" and "i" typecode, and we can consider them the same + # to all practical effects + self.assertIn(b.dtype.char, ("l", "i")) + elif (a.dtype.char == "I" or a.dtype.char == "L"): + # Special expection. We have no way to distinguish between + # "L" and "I" typecode, and we can consider them the same + # to all practical effects + self.assertIn(b.dtype.char, ("L", "I")) + else: + self.assertTrue(common.allequal(a, b, "numpy")) + elif np.dtype('l').itemsize == 8: + if (a.dtype.char == "q" or a.dtype.char == "l"): + # Special expection. We have no way to distinguish between + # "q" and "l" typecode in 64-bit platforms, and we can + # consider them the same to all practical effects + self.assertIn(b.dtype.char, ("l", "q")) + elif (a.dtype.char == "Q" or a.dtype.char == "L"): + # Special expection. 
We have no way to distinguish between + # "Q" and "L" typecode in 64-bit platforms, and we can + # consider them the same to all practical effects + self.assertIn(b.dtype.char, ("L", "Q")) + else: + self.assertTrue(common.allequal(a, b, "numpy")) + + # Iterate over the next group + group = getattr(group, 'group' + str(i)) + + def test01_largeRankArrays(self): + """Checking creation of large rank arrays (0 < rank <= 32) + + It also uses arrays ranks which ranges until maxrank. + + """ + + # maximum level of recursivity (deepest group level) achieved: + # maxrank = 32 (for a effective maximum rank of 32) + # This limit is due to a limit in the HDF5 library. + minrank = 1 + maxrank = 32 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_largeRankArrays..." % + self.__class__.__name__) + print("Maximum rank for tested arrays:", maxrank) + + group = self.h5file.root + if common.verbose: + print("Rank array writing progress: ", end=' ') + for rank in range(minrank, maxrank + 1): + # Create an array of integers, with incrementally bigger ranges + a = np.ones((1,) * rank, 'i') + if common.verbose: + print("%3d," % (rank), end=' ') + self.h5file.create_array(group, "array", a, "Rank: %s" % rank) + group = self.h5file.create_group(group, 'group' + str(rank)) + + # Flush the buffers + self.h5file.flush() + + self._reopen() + + group = self.h5file.root + if common.verbose: + print() + print("Rank array reading progress: ") + # Get the metadata on the previosly saved arrays + for rank in range(minrank, maxrank + 1): + # Create an array for later comparison + a = np.ones((1,) * rank, 'i') + # Get the actual array + b = group.array.read() + if common.verbose: + print("%3d," % (rank), end=' ') + if not a.tolist() == b.tolist() and common.verbose: + dset = group.array + print("Info from dataset:", dset._v_pathname) + print(" Shape: ==>", dset.shape, end=' ') + print(" typecode ==> %c" % dset.typecode) + print("Array b read from file. Shape: ==>", b.shape, end=' ') + print(". Type ==> %c" % b.dtype.char) + self.assertEqual(a.shape, b.shape) + if a.dtype.char == "i": + # Special expection. 
We have no way to distinguish between + # "l" and "i" typecode, and we can consider them the same + # to all practical effects + self.assertIn(b.dtype.char, ("l", "i")) + else: + self.assertEqual(a.dtype.char, b.dtype.char) + + self.assertEqual(a, b) + + # Iterate over the next group + group = self.h5file.get_node(group, 'group' + str(rank)) + + if common.verbose: + print() # This flush the stdout buffer + + +# Test Record class +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"abcd", pos=0) + var2 = tb.StringCol(itemsize=1, dflt=b"a", pos=1) + var3 = tb.BoolCol(dflt=1) + var4 = tb.Int8Col(dflt=1) + var5 = tb.UInt8Col(dflt=1) + var6 = tb.Int16Col(dflt=1) + var7 = tb.UInt16Col(dflt=1) + var8 = tb.Int32Col(dflt=1) + var9 = tb.UInt32Col(dflt=1) + var10 = tb.Int64Col(dflt=1) + var11 = tb.Float32Col(dflt=1.0) + var12 = tb.Float64Col(dflt=1.0) + var13 = tb.ComplexCol(itemsize=8, dflt=(1.+0.j)) + var14 = tb.ComplexCol(itemsize=16, dflt=(1.+0.j)) + if hasattr(tb, 'Float16Col'): + var15 = tb.Float16Col(dflt=1.0) + if hasattr(tb, 'Float96Col'): + var16 = tb.Float96Col(dflt=1.0) + if hasattr(tb, 'Float128Col'): + var17 = tb.Float128Col(dflt=1.0) + if hasattr(tb, 'Complex196Col'): + var18 = tb.ComplexCol(itemsize=24, dflt=(1.+0.j)) + if hasattr(tb, 'Complex256Col'): + var19 = tb.ComplexCol(itemsize=32, dflt=(1.+0.j)) + + +class TableReadTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = 100 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + table = self.h5file.create_table(self.h5file.root, 'table', Record) + for i in range(self.nrows): + table.row.append() # Fill 100 rows with default values + + self._reopen(mode='a') + + def test01_readTableChar(self): + """Checking column conversion into NumPy in read(). + + Char flavor + + """ + + table = self.h5file.root.table + table.flavor = "numpy" + for colname in table.colnames: + numcol = table.read(field=colname) + typecol = table.coltypes[colname] + itemsizecol = table.description._v_dtypes[colname].base.itemsize + nctypecode = numcol.dtype.char + if typecol == "string": + if itemsizecol > 1: + orignumcol = np.array(['abcd']*self.nrows, dtype='S4') + else: + orignumcol = np.array(['a']*self.nrows, dtype='S1') + if common.verbose: + print("Typecode of NumPy column read:", nctypecode) + print("Should look like:", 'c') + print("Itemsize of column:", itemsizecol) + print("Shape of NumPy column read:", numcol.shape) + print("Should look like:", orignumcol.shape) + print("First 3 elements of read col:", numcol[:3]) + # Check that both NumPy objects are equal + self.assertTrue(common.allequal(numcol, orignumcol, "numpy")) + + def test01_readTableNum(self): + """Checking column conversion into NumPy in read(). + + NumPy flavor + + """ + + table = self.h5file.root.table + table.flavor = "numpy" + for colname in table.colnames: + numcol = table.read(field=colname) + typecol = table.coltypes[colname] + nctypecode = np.sctypeDict[numcol.dtype.char[0]] + if typecol != "string": + if common.verbose: + print("Typecode of NumPy column read:", nctypecode) + print("Should look like:", typecol) + orignumcol = np.ones(shape=self.nrows, dtype=numcol.dtype.char) + # Check that both NumPy objects are equal + self.assertTrue(common.allequal(numcol, orignumcol, "numpy")) + + def test02_readCoordsChar(self): + """Column conversion into NumPy in readCoords(). 
+ + Chars + + """ + + table = self.h5file.root.table + table.flavor = "numpy" + coords = [1, 2, 3] + self.nrows = len(coords) + for colname in table.colnames: + numcol = table.read_coordinates(coords, field=colname) + typecol = table.coltypes[colname] + itemsizecol = table.description._v_dtypes[colname].base.itemsize + nctypecode = numcol.dtype.char + if typecol == "string": + if itemsizecol > 1: + orignumcol = np.array(['abcd']*self.nrows, dtype='S4') + else: + orignumcol = np.array(['a']*self.nrows, dtype='S1') + if common.verbose: + print("Typecode of NumPy column read:", nctypecode) + print("Should look like:", 'c') + print("Itemsize of column:", itemsizecol) + print("Shape of NumPy column read:", numcol.shape) + print("Should look like:", orignumcol.shape) + print("First 3 elements of read col:", numcol[:3]) + # Check that both NumPy objects are equal + self.assertTrue(common.allequal(numcol, orignumcol, "numpy")) + + def test02_readCoordsNum(self): + """Column conversion into NumPy in read_coordinates(). + + NumPy. + + """ + + table = self.h5file.root.table + table.flavor = "numpy" + coords = [1, 2, 3] + self.nrows = len(coords) + for colname in table.colnames: + numcol = table.read_coordinates(coords, field=colname) + typecol = table.coltypes[colname] + type_ = numcol.dtype.type + if typecol != "string": + if typecol == "int64": + return + if common.verbose: + print("Type of read NumPy column:", type_) + print("Should look like:", typecol) + orignumcol = np.ones(shape=self.nrows, dtype=numcol.dtype.char) + # Check that both NumPy objects are equal + self.assertTrue(common.allequal(numcol, orignumcol, "numpy")) + + def test03_getIndexNumPy(self): + """Getting table rows specifyied as NumPy scalar integers.""" + + table = self.h5file.root.table + coords = np.array([1, 2, 3], dtype='int8') + for colname in table.colnames: + numcol = [table[coord][colname] for coord in coords] + typecol = table.coltypes[colname] + if typecol != "string": + if typecol == "int64": + return + numcol = np.array(numcol, typecol) + if common.verbose: + type_ = numcol.dtype.type + print("Type of read NumPy column:", type_) + print("Should look like:", typecol) + orignumcol = np.ones(shape=len(numcol), + dtype=numcol.dtype.char) + # Check that both NumPy objects are equal + self.assertTrue(common.allequal(numcol, orignumcol, "numpy")) + + def test04_setIndexNumPy(self): + """Setting table rows specifyied as NumPy integers.""" + + self._reopen(mode='a') + table = self.h5file.root.table + table.flavor = "numpy" + coords = np.array([1, 2, 3], dtype='int8') + # Modify row 1 + # From PyTables 2.0 on, assignments to records can be done + # only as tuples (see http://projects.scipy.org/scipy/numpy/ticket/315) + # table[coords[0]] = ["aasa","x"]+[232]*12 + + n = len(Record.columns) - 2 + + table[coords[0]] = tuple(["aasa", "x"]+[232]*n) # XXX + # record = list(table[coords[0]]) + record = table.read(coords[0], coords[0] + 1) + if common.verbose: + print("Original row:\n" + "['aasa', 'x', True, -24, 232, 232, 232, 232, 232L, " + "232, 232.0, 232.0, (232 + 0j), (232+0j), 232.0, " + "(232+0j)]\n") + print("Read row:\n", record) + self.assertEqual(record['var1'], b'aasa') + self.assertEqual(record['var2'], b'x') + self.assertEqual(record['var3'], True) + self.assertEqual(record['var4'], -24) + self.assertEqual(record['var7'], 232) + + +# The declaration of the nested table: +class Info(tb.IsDescription): + _v_pos = 3 + Name = tb.StringCol(itemsize=2) + Value = tb.ComplexCol(itemsize=16) + + +class 
TestTDescr(tb.IsDescription): + + """A description that has several nested columns.""" + + x = tb.Int32Col(dflt=0, shape=2, pos=0) # 0 + y = tb.FloatCol(dflt=1, shape=(2, 2)) + z = tb.UInt8Col(dflt=1) + z3 = tb.EnumCol({'r': 4, 'g': 2, 'b': 1}, 'r', 'int32', shape=2) + color = tb.StringCol(itemsize=4, dflt=b"ab", pos=2) + info = Info() + + class Info(tb.IsDescription): # 1 + _v_pos = 1 + name = tb.StringCol(itemsize=2) + value = tb.ComplexCol(itemsize=16, pos=0) # 0 + y2 = tb.FloatCol(pos=1) # 1 + z2 = tb.UInt8Col() + + class Info2(tb.IsDescription): + y3 = tb.Time64Col(shape=2) + name = tb.StringCol(itemsize=2) + value = tb.ComplexCol(itemsize=16, shape=2) + + +class TableNativeFlavorTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = 100 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + table = self.h5file.create_table(self.h5file.root, 'table', TestTDescr, + expectedrows=self.nrows) + table.flavor = "numpy" + for i in range(self.nrows): + table.row.append() # Fill 100 rows with default values + table.flush() + + def test01a_basicTableRead(self): + """Checking the return of a NumPy in read().""" + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table[:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the value of some columns + # A flat column + col = table.cols.x[:3] + self.assertIsInstance(col, np.ndarray) + npcol = np.zeros((3, 2), dtype="int32") + self.assertTrue(common.allequal(col, npcol, "numpy")) + + # A nested column + col = table.cols.Info[:3] + self.assertIsInstance(col, np.ndarray) + dtype = [('value', 'c16'), + ('y2', 'f8'), + ('Info2', + [('name', 'S2'), + ('value', 'c16', (2,)), + ('y3', 'f8', (2,))]), + ('name', 'S2'), + ('z2', 'u1')] + npcol = np.zeros((3,), dtype=dtype) + self.assertEqual(col.dtype.descr, npcol.dtype.descr) + if common.verbose: + print("col-->", col) + print("npcol-->", npcol) + + # A copy() is needed in case the buffer can be in different segments + self.assertEqual(bytes(col.copy().data), bytes(npcol.data)) + + def test01b_basicTableRead(self): + """Checking the return of a NumPy in read() (strided version).""" + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table[::3] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the value of some columns + # A flat column + col = table.cols.x[:9:3] + self.assertIsInstance(col, np.ndarray) + npcol = np.zeros((3, 2), dtype="int32") + self.assertTrue(common.allequal(col, npcol, "numpy")) + + # A nested column + col = table.cols.Info[:9:3] + self.assertIsInstance(col, np.ndarray) + dtype = [('value', '%sc16' % byteorder), + ('y2', '%sf8' % byteorder), + ('Info2', + [('name', '|S2'), + ('value', '%sc16' % byteorder, (2,)), + ('y3', '%sf8' % byteorder, (2,))]), + ('name', '|S2'), + ('z2', '|u1')] + npcol = np.zeros((3,), dtype=dtype) + self.assertEqual(col.dtype.descr, npcol.dtype.descr) + if common.verbose: + print("col-->", col) + print("npcol-->", npcol) + + # A copy() is needed in case the buffer can be in different segments + self.assertEqual(bytes(col.copy().data), 
bytes(npcol.data)) + + def test02_getWhereList(self): + """Checking the return of NumPy in get_where_list method.""" + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.get_where_list('z == 1') + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 100) + + # Finally, check that the contents are ok + self.assertTrue(common.allequal( + data, np.arange(100, dtype="i8"), "numpy")) + + def test03a_readWhere(self): + """Checking the return of NumPy in read_where method (strings).""" + + table = self.h5file.root.table + table.cols.color.create_index() + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.read_where('color == b"ab"') + if common.verbose: + print("Type of read:", type(data)) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), self.nrows) + + def test03b_readWhere(self): + """Checking the return of NumPy in read_where method (numeric).""" + + table = self.h5file.root.table + table.cols.z.create_index() + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.read_where('z == 0') + if common.verbose: + print("Type of read:", type(data)) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 0) + + def test04a_createTable(self): + """Checking the Table creation from a numpy recarray.""" + + dtype = [('value', '%sc16' % byteorder), + ('y2', '%sf8' % byteorder), + ('Info2', + [('name', '|S2'), + ('value', '%sc16' % byteorder, (2,)), + ('y3', '%sf8' % byteorder, (2,))]), + ('name', '|S2'), + ('z2', '|u1')] + npdata = np.zeros((3,), dtype=dtype) + table = self.h5file.create_table(self.h5file.root, 'table2', npdata) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table2 + data = table[:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npdata.dtype.descr) + if common.verbose: + print("npdata-->", npdata) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(bytes(data.copy().data), bytes(npdata.data)) + + def test04b_appendTable(self): + """Checking appending a numpy recarray.""" + + table = self.h5file.root.table + npdata = table[3:6] + table.append(npdata) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table[-3:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("Last 3 elements of read:", data[-3:]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npdata.dtype.descr) 
+ if common.verbose: + print("npdata-->", npdata) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(bytes(data.copy().data), bytes(npdata.data)) + + def test05a_assignColumn(self): + """Checking assigning to a column.""" + + table = self.h5file.root.table + table.cols.z[:] = np.zeros((100,), dtype='u1') + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.cols.z[:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 100) + + # Finally, check that the contents are ok + self.assertTrue(common.allequal( + data, np.zeros((100,), dtype="u1"), "numpy")) + + def test05b_modifyingColumns(self): + """Checking modifying several columns at once.""" + + table = self.h5file.root.table + xcol = np.ones((3, 2), 'int32') + ycol = np.zeros((3, 2, 2), 'float64') + zcol = np.zeros((3,), 'uint8') + table.modify_columns(3, 6, 1, [xcol, ycol, zcol], ['x', 'y', 'z']) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.cols.y[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(data.copy().data, ycol.data) + + def test05c_modifyingColumns(self): + """Checking modifying several columns using a single numpy buffer.""" + + table = self.h5file.root.table + dtype = [('x', 'i4', (2,)), ('y', 'f8', (2, 2)), ('z', 'u1')] + nparray = np.zeros((3,), dtype=dtype) + table.modify_columns(3, 6, 1, nparray, ['x', 'y', 'z']) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + ycol = np.zeros((3, 2, 2), 'float64') + data = table.cols.y[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(data.copy().data, ycol.data) + + def test06a_assignNestedColumn(self): + """Checking assigning a nested column (using modify_column).""" + + table = self.h5file.root.table + dtype = [('value', '%sc16' % byteorder), + ('y2', '%sf8' % byteorder), + ('Info2', + [('name', '|S2'), + ('value', '%sc16' % byteorder, (2,)), + ('y3', '%sf8' % byteorder, (2,))]), + ('name', '|S2'), + ('z2', '|u1')] + npdata = np.zeros((3,), dtype=dtype) + data = table.cols.Info[3:6] + table.modify_column(3, 6, 1, column=npdata, colname='Info') + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = 
table.cols.Info[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npdata.dtype.descr) + if common.verbose: + print("npdata-->", npdata) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(bytes(data.copy().data), bytes(npdata.data)) + + def test06b_assignNestedColumn(self): + """Checking assigning a nested column (using the .cols accessor).""" + + table = self.h5file.root.table + dtype = [('value', '%sc16' % byteorder), + ('y2', '%sf8' % byteorder), + ('Info2', + [('name', '|S2'), + ('value', '%sc16' % byteorder, (2,)), + ('y3', '%sf8' % byteorder, (2,))]), + ('name', '|S2'), + ('z2', '|u1')] + npdata = np.zeros((3,), dtype=dtype) + # self.assertRaises(NotImplementedError, + # table.cols.Info.__setitem__, slice(3,6,1), npdata) + table.cols.Info[3:6] = npdata + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + data = table.cols.Info[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npdata.dtype.descr) + if common.verbose: + print("npdata-->", npdata) + print("data-->", data) + + # A copy() is needed in case the buffer would be in different segments + self.assertEqual(bytes(data.copy().data), bytes(npdata.data)) + + def test07a_modifyingRows(self): + """Checking modifying several rows at once (using modify_rows).""" + + table = self.h5file.root.table + + # Read a chunk of the table + chunk = table[0:3] + + # Modify it somewhat + chunk['y'][:] = -1 + table.modify_rows(3, 6, 1, rows=chunk) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + ycol = np.zeros((3, 2, 2), 'float64')-1 + data = table.cols.y[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + self.assertTrue(common.allequal(ycol, data, "numpy")) + + def test07b_modifyingRows(self): + """Checking modifying several rows at once (using cols accessor).""" + + table = self.h5file.root.table + + # Read a chunk of the table + chunk = table[0:3] + + # Modify it somewhat + chunk['y'][:] = -1 + table.cols[3:6] = chunk + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check that some column has been actually modified + ycol = np.zeros((3, 2, 2), 'float64')-1 + data = table.cols.y[3:6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + 
self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + self.assertTrue(common.allequal(ycol, data, "numpy")) + + def test08a_modifyingRows(self): + """Checking modifying just one row at once (using modify_rows).""" + + table = self.h5file.root.table + + # Read a chunk of the table + chunk = table[3:4] + + # Modify it somewhat + chunk['y'][:] = -1 + table.modify_rows(6, 7, 1, chunk) + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check that some column has been actually modified + ycol = np.zeros((2, 2), 'float64')-1 + data = table.cols.y[6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + self.assertTrue(common.allequal(ycol, data, "numpy")) + + def test08b_modifyingRows(self): + """Checking modifying just one row at once (using cols accessor).""" + + table = self.h5file.root.table + + # Read a chunk of the table + chunk = table[3:4] + + # Modify it somewhat + chunk['y'][:] = -1 + table.cols[6] = chunk + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + + # Check that some column has been actually modified + ycol = np.zeros((2, 2), 'float64')-1 + data = table.cols.y[6] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + print("Length of the data read:", len(data)) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, ycol.dtype.descr) + if common.verbose: + print("ycol-->", ycol) + print("data-->", data) + self.assertTrue(common.allequal(ycol, data, "numpy")) + + def test09a_getStrings(self): + """Checking the return of string columns with spaces.""" + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + rdata = table.get_where_list('color == b"ab"') + data = table.read_coordinates(rdata) + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 100) + + # Finally, check that the contents are ok + for idata in data['color']: + self.assertEqual(idata, np.array("ab", dtype="|S4")) + + def test09b_getStrings(self): + """Checking the return of string columns with spaces. 
+ + (modify) + + """ + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + for i in range(50): + table.cols.color[i] = "a " + table.flush() + data = table[:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 100) + + # Finally, check that the contents are ok + for i in range(100): + idata = data['color'][i] + if i >= 50: + self.assertEqual(idata, np.array("ab", dtype="|S4")) + else: + self.assertEqual(idata, np.array("a ", dtype="|S4")) + + def test09c_getStrings(self): + """Checking the return of string columns with spaces. + + (append) + + """ + + if self.close: + self._reopen(mode='a') + table = self.h5file.root.table + row = table.row + for i in range(50): + row["color"] = "a " # note the trailing spaces + row.append() + table.flush() + if self.close: + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, "a") + data = self.h5file.root.table[:] + if common.verbose: + print("Type of read:", type(data)) + print("Description of the record:", data.dtype.descr) + print("First 3 elements of read:", data[:3]) + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check that all columns have been selected + self.assertEqual(len(data), 150) + + # Finally, check that the contents are ok + for i in range(150): + idata = data['color'][i] + if i < 100: + self.assertEqual(idata, np.array("ab", dtype="|S4")) + else: + self.assertEqual(idata, np.array("a ", dtype="|S4")) + + +class TableNativeFlavorOpenTestCase(TableNativeFlavorTestCase): + close = False + + +class TableNativeFlavorCloseTestCase(TableNativeFlavorTestCase): + close = True + + +class AttributesTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.h5file.create_group(self.h5file.root, 'group') + + def test01_writeAttribute(self): + """Checking the creation of a numpy attribute.""" + + group = self.h5file.root.group + g_attrs = group._v_attrs + g_attrs.numpy1 = np.zeros((1, 1), dtype='int16') + if self.close: + self._reopen(mode='a') + group = self.h5file.root.group + g_attrs = group._v_attrs + + # Check that we can retrieve a numpy object + data = g_attrs.numpy1 + npcomp = np.zeros((1, 1), dtype='int16') + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npcomp.dtype.descr) + if common.verbose: + print("npcomp-->", npcomp) + print("data-->", data) + self.assertTrue(common.allequal(npcomp, data, "numpy")) + + def test02_updateAttribute(self): + """Checking the modification of a numpy attribute.""" + + group = self.h5file.root.group + g_attrs = group._v_attrs + g_attrs.numpy1 = np.zeros((1, 2), dtype='int16') + if self.close: + self._reopen(mode='a') + group = self.h5file.root.group + g_attrs = group._v_attrs + + # Update this attribute + g_attrs.numpy1 = np.ones((1, 2), dtype='int16') + + # Check that we can retrieve a numpy object + data = g_attrs.numpy1 + npcomp = np.ones((1, 2), dtype='int16') + + # Check that both NumPy objects are equal + self.assertIsInstance(data, np.ndarray) + + # Check the type + self.assertEqual(data.dtype.descr, npcomp.dtype.descr) + if common.verbose: + print("npcomp-->", npcomp) + 
print("data-->", data) + self.assertTrue(common.allequal(npcomp, data, "numpy")) + + +class AttributesOpenTestCase(AttributesTestCase): + close = 0 + + +class AttributesCloseTestCase(AttributesTestCase): + close = 1 + + +class StrlenTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + group = self.h5file.create_group(self.h5file.root, 'group') + tablelayout = {'Text': tb.StringCol(itemsize=1000), } + self.table = self.h5file.create_table(group, 'table', tablelayout) + self.table.flavor = 'numpy' + row = self.table.row + row['Text'] = 'Hello Francesc!' # XXX: check unicode --> bytes + row.append() + row['Text'] = 'Hola Francesc!' # XXX: check unicode --> bytes + row.append() + self.table.flush() + + def test01(self): + """Checking the lengths of strings (read field).""" + + if self.close: + self._reopen(mode='a') + self.table = self.h5file.root.group.table + + # Get both strings + str1 = self.table.col('Text')[0] + str2 = self.table.col('Text')[1] + if common.verbose: + print("string1-->", str1) + print("string2-->", str2) + + # Check that both NumPy objects are equal + self.assertEqual(len(str1), len(b'Hello Francesc!')) + self.assertEqual(len(str2), len(b'Hola Francesc!')) + self.assertEqual(str1, b'Hello Francesc!') + self.assertEqual(str2, b'Hola Francesc!') + + def test02(self): + """Checking the lengths of strings (read recarray).""" + + if self.close: + self._reopen(mode='a') + self.table = self.h5file.root.group.table + + # Get both strings + str1 = self.table[:]['Text'][0] + str2 = self.table[:]['Text'][1] + + # Check that both NumPy objects are equal + self.assertEqual(len(str1), len(b'Hello Francesc!')) + self.assertEqual(len(str2), len(b'Hola Francesc!')) + self.assertEqual(str1, b'Hello Francesc!') + self.assertEqual(str2, b'Hola Francesc!') + + def test03(self): + """Checking the lengths of strings (read recarray, row by row).""" + + if self.close: + self._reopen(mode='a') + self.table = self.h5file.root.group.table + + # Get both strings + str1 = self.table[0]['Text'] + str2 = self.table[1]['Text'] + + # Check that both NumPy objects are equal + self.assertEqual(len(str1), len(b'Hello Francesc!')) + self.assertEqual(len(str2), len(b'Hola Francesc!')) + self.assertEqual(str1, b'Hello Francesc!') + self.assertEqual(str2, b'Hola Francesc!') + + +class StrlenOpenTestCase(StrlenTestCase): + close = 0 + + +class StrlenCloseTestCase(StrlenTestCase): + close = 1 + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + # theSuite.addTest(unittest.makeSuite(StrlenOpenTestCase)) + # theSuite.addTest(unittest.makeSuite(Basic0DOneTestCase)) + # theSuite.addTest(unittest.makeSuite(GroupsArrayTestCase)) + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(Basic0DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic0DTwoTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DOneTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DTwoTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic1DThreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(Basic2DTestCase)) + theSuite.addTest(common.unittest.makeSuite(GroupsArrayTestCase)) + theSuite.addTest(common.unittest.makeSuite(TableReadTestCase)) + theSuite.addTest( + common.unittest.makeSuite(TableNativeFlavorOpenTestCase)) + theSuite.addTest( + common.unittest.makeSuite(TableNativeFlavorCloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(AttributesOpenTestCase)) + 
theSuite.addTest(common.unittest.makeSuite(AttributesCloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(StrlenOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(StrlenCloseTestCase)) + if common.heavy: + theSuite.addTest(common.unittest.makeSuite(Basic10DTestCase)) + # The 32 dimensions case takes forever to run!! + # theSuite.addTest(unittest.makeSuite(Basic32DTestCase)) + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_queries.py b/tables/tests/test_queries.py new file mode 100644 index 0000000..b9406a3 --- /dev/null +++ b/tables/tests/test_queries.py @@ -0,0 +1,1279 @@ +"""Test module for queries on datasets.""" + +import re +import sys +import warnings +import functools + +import numpy as np + +import tables as tb +from tables.tests import common + + +# Data parameters +# --------------- +row_period = 50 +"""Maximum number of unique rows before they start cycling.""" +md_shape = (2, 2) +"""Shape of multidimensional fields.""" + +_maxnvalue = row_period + np.prod(md_shape, dtype=tb.utils.SizeType) - 1 +_strlen = int(np.log10(_maxnvalue-1)) + 1 + +str_format = '%%0%dd' % _strlen +"""Format of string values.""" + +small_blocksizes = (300, 60, 20, 5) +# small_blocksizes = (512, 128, 32, 4) # for manual testing only +"""Sensible parameters for indexing with small blocksizes.""" + + +# Type information +# ---------------- +type_info = { + 'bool': (np.bool_, bool), + 'int8': (np.int8, int), + 'uint8': (np.uint8, int), + 'int16': (np.int16, int), + 'uint16': (np.uint16, int), + 'int32': (np.int32, int), + 'uint32': (np.uint32, int), + 'int64': (np.int64, int), + 'uint64': (np.uint64, int), + 'float32': (np.float32, float), + 'float64': (np.float64, float), + 'complex64': (np.complex64, complex), + 'complex128': (np.complex128, complex), + 'time32': (np.int32, int), + 'time64': (np.float64, float), + 'enum': (np.uint8, int), # just for these tests + 'string': ('S%s' % _strlen, np.string_), # just for these tests +} +"""NumPy and Numexpr type for each PyTables type that will be tested.""" + +# globals dict for eval() +func_info = {'log10': np.log10, 'log': np.log, 'exp': np.exp, + 'abs': np.abs, 'sqrt': np.sqrt, + 'sin': np.sin, 'cos': np.cos, 'tan': np.tan, + 'arcsin': np.arcsin, 'arccos': np.arccos, 'arctan': np.arctan} +"""functions and NumPy.ufunc() for each function that will be tested.""" + + +if hasattr(np, 'float16'): + type_info['float16'] = (np.float16, float) +# if hasattr(numpy, 'float96'): +# type_info['float96'] = (numpy.float96, float) +# if hasattr(numpy, 'float128'): +# type_info['float128'] = (numpy.float128, float) +# if hasattr(numpy, 'complex192'): +# type_info['complex192'] = (numpy.complex192, complex) +# if hasattr(numpy, 'complex256'): +# type_info['complex256'] = (numpy.complex256, complex) + +sctype_from_type = {type_: info[0] for (type_, info) in type_info.items()} +"""Maps PyTables types to NumPy scalar types.""" +nxtype_from_type = {type_: info[1] for (type_, info) in type_info.items()} +"""Maps PyTables types to Numexpr types.""" + +heavy_types = frozenset(['uint8', 'int16', 'uint16', 'float32', 'complex64']) +"""PyTables types to be tested only in heavy mode.""" + +enum = tb.Enum({'n%d' % i: i for i in range(_maxnvalue)}) +"""Enumerated type to be used in tests.""" + + +# Table description +# ----------------- +def append_columns(classdict, shape=()): + """Append a ``Col`` of each PyTables data type to the 
`classdict`. + + A column of a certain TYPE gets called ``c_TYPE``. The number of + added columns is returned. + + """ + heavy = common.heavy + for (itype, type_) in enumerate(sorted(type_info)): + if not heavy and type_ in heavy_types: + continue # skip heavy type in non-heavy mode + colpos = itype + 1 + colname = 'c_%s' % type_ + if type_ == 'enum': + base = tb.Atom.from_sctype(sctype_from_type[type_]) + col = tb.EnumCol(enum, enum(0), base, shape=shape, pos=colpos) + else: + sctype = sctype_from_type[type_] + dtype = np.dtype((sctype, shape)) + col = tb.Col.from_dtype(dtype, pos=colpos) + classdict[colname] = col + ncols = colpos + return ncols + + +def nested_description(classname, pos, shape=()): + """Return a nested column description with all PyTables data types. + + A column of a certain TYPE gets called ``c_TYPE``. The nested + column will be placed in the position indicated by `pos`. + + """ + classdict = {} + append_columns(classdict, shape=shape) + classdict['_v_pos'] = pos + return type(classname, (tb.IsDescription,), classdict) + + +def table_description(classname, nclassname, shape=()): + """Return a table description for testing queries. + + The description consists of all PyTables data types, both in the + top level and in the ``c_nested`` nested column. A column of a + certain TYPE gets called ``c_TYPE``. An extra integer column + ``c_extra`` is also provided. If a `shape` is given, it will be + used for all columns. Finally, an extra indexed column + ``c_idxextra`` is added as well in order to provide some basic + tests for multi-index queries. + + """ + classdict = {} + colpos = append_columns(classdict, shape) + + ndescr = nested_description(nclassname, colpos, shape=shape) + classdict['c_nested'] = ndescr + colpos += 1 + + extracol = tb.IntCol(shape=shape, pos=colpos) + classdict['c_extra'] = extracol + colpos += 1 + + idxextracol = tb.IntCol(shape=shape, pos=colpos) + classdict['c_idxextra'] = idxextracol + colpos += 1 + + return type(classname, (tb.IsDescription,), classdict) + + +TableDescription = table_description( + 'TableDescription', 'NestedDescription') +"""Unidimensional table description for testing queries.""" + +MDTableDescription = table_description( + 'MDTableDescription', 'MDNestedDescription', shape=md_shape) +"""Multidimensional table description for testing queries.""" + + +# Table data +# ---------- +table_data = {} +"""Cached table data for a given shape and number of rows.""" +# Data is cached because computing it row by row is quite slow. Hop! + + +def fill_table(table, shape, nrows): + """Fill the given `table` with `nrows` rows of data. + + Values in the i-th row (where 0 <= i < `row_period`) for a + multidimensional field with M elements span from i to i + M-1. For + subsequent rows, values repeat cyclically. + + The same goes for the ``c_extra`` column, but values range from + -`row_period`/2 to +`row_period`/2. + + """ + # Reuse already computed data if possible. 
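+ # Cached data is keyed by (shape, nrows); on a cache hit the whole recarray is appended in a single call instead of refilling the table row by row.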
+ tdata = table_data.get((shape, nrows)) + if tdata is not None: + table.append(tdata) + table.flush() + return + + heavy = common.heavy + size = int(np.prod(shape, dtype=tb.utils.SizeType)) + + row, value = table.row, 0 + for nrow in range(nrows): + data = np.arange(value, value + size).reshape(shape) + for (type_, sctype) in sctype_from_type.items(): + if not heavy and type_ in heavy_types: + continue # skip heavy type in non-heavy mode + colname = 'c_%s' % type_ + ncolname = 'c_nested/%s' % colname + if type_ == 'bool': + coldata = data > (row_period // 2) + elif type_ == 'string': + sdata = [str_format % x for x in range(value, value + size)] + coldata = np.array(sdata, dtype=sctype).reshape(shape) + else: + coldata = np.asarray(data, dtype=sctype) + row[ncolname] = row[colname] = coldata + row['c_extra'] = data - (row_period // 2) + row['c_idxextra'] = data - (row_period // 2) + row.append() + value += 1 + if value == row_period: + value = 0 + table.flush() + + # Make computed data reusable. + tdata = table.read() + table_data[(shape, nrows)] = tdata + + +class SilentlySkipTest(common.unittest.SkipTest): + pass + + +# Base test cases +# --------------- +class BaseTableQueryTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Base test case for querying tables. + + Sub-classes must define the following attributes: + + ``tableDescription`` + The description of the table to be created. + ``shape`` + The shape of data fields in the table. + ``nrows`` + The number of data rows to be generated for the table. + + Sub-classes may redefine the following attributes: + + ``indexed`` + Whether columns shall be indexed, if possible. Default is not + to index them. + ``optlevel`` + The level of optimisation of column indexes. Default is 0. + + """ + + indexed = False + optlevel = 0 + + colNotIndexable_re = re.compile(r"\bcan not be indexed\b") + condNotBoolean_re = re.compile(r"\bdoes not have a boolean type\b") + + def create_indexes(self, colname, ncolname, extracolname): + if not self.indexed: + return + try: + kind = self.kind + common.verbosePrint( + f"* Indexing ``{colname}`` columns. 
Type: {kind}.") + for acolname in [colname, ncolname, extracolname]: + acolumn = self.table.colinstances[acolname] + acolumn.create_index( + kind=self.kind, optlevel=self.optlevel, + _blocksizes=small_blocksizes, _testmode=True) + + except TypeError as te: + if self.colNotIndexable_re.search(str(te)): + raise SilentlySkipTest( + "Columns of this type can not be indexed.") + raise + except NotImplementedError: + raise SilentlySkipTest( + "Indexing columns of this type is not supported yet.") + + def setUp(self): + super().setUp() + self.table = self.h5file.create_table( + '/', 'test', self.tableDescription, expectedrows=self.nrows) + fill_table(self.table, self.shape, self.nrows) + + +class ScalarTableMixin: + tableDescription = TableDescription + shape = () + + +class MDTableMixin: + tableDescription = MDTableDescription + shape = md_shape + + +# Test cases on query data +# ------------------------ +operators = [ + None, '~', + '<', '<=', '==', '!=', '>=', '>', + ('<', '<='), ('>', '>=')] +"""Comparison operators to check with different types.""" +heavy_operators = frozenset(['~', '<=', '>=', '>', ('>', '>=')]) +"""Comparison operators to be tested only in heavy mode.""" +left_bound = row_period // 4 +"""Operand of left side operator in comparisons with operator pairs.""" +right_bound = row_period * 3 // 4 +"""Operand of right side operator in comparisons with operator pairs.""" +func_bound = 0.8 # must be <1 for trig functions to be able to fail +"""Operand of right side operator in comparisons with functions. """ +extra_conditions = [ + '', # uses one index + '& ((c_extra + 1) < 0)', # uses one index + '| (c_idxextra > 0)', # uses two indexes + '| ((c_idxextra > 0) | ((c_extra + 1) > 0))', # can't use indexes +] +"""Extra conditions to append to comparison conditions.""" + + +class TableDataTestCase(BaseTableQueryTestCase): + """Base test case for querying table data. + + Automatically created test method names have the format + ``test_XNNNN``, where ``NNNN`` is the zero-padded test number and + ``X`` indicates whether the test belongs to the light (``l``) or + heavy (``h``) set. + + """ + _testfmt_light = 'test_l%04d' + _testfmt_heavy = 'test_h%04d' + + +def create_test_method(type_, op, extracond, func=None): + sctype = sctype_from_type[type_] + + # Compute the value of bounds. + condvars = {'bound': right_bound, + 'lbound': left_bound, + 'rbound': right_bound, + 'func_bound': func_bound} + for (bname, bvalue) in condvars.items(): + if type_ == 'string': + bvalue = str_format % bvalue + bvalue = nxtype_from_type[type_](bvalue) + condvars[bname] = bvalue + + # Compute the name of columns. + colname = 'c_%s' % type_ + ncolname = 'c_nested/%s' % colname + + # Compute the query condition. 
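+    # Rough sketch of the condition strings built below (actual literals
+    # depend on the column type and the chosen bounds):
+    #   op '<'              ->  "c_int32 < <bound>"
+    #   op ('>', '>=')      ->  "(lbound > c_int32) & (c_int32 >= rbound)"
+    #   func 'sin', op '<'  ->  "sin(c_float64) < func_bound"
+    #   plus an extracond   ->  "(<cond>) | (c_idxextra > 0)"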
+ if not op: # as is + cond = colname + elif op == '~': # unary + cond = '~(%s)' % colname + elif op == '<' and func is None: # binary variable-constant + cond = '{} {} {}'.format(colname, op, repr(condvars['bound'])) + elif isinstance(op, tuple): # double binary variable-constant + cond = ('(lbound %s %s) & (%s %s rbound)' + % (op[0], colname, colname, op[1])) + elif func is not None: + cond = f'{func}({colname}) {op} func_bound' + else: # function or binary variable-variable + cond = f'{colname} {op} bound' + if extracond: + cond = f'({cond}) {extracond}' + + def ignore_skipped(oldmethod): + @functools.wraps(oldmethod) + def newmethod(self, *args, **kwargs): + self._verboseHeader() + try: + return oldmethod(self, *args, **kwargs) + except SilentlySkipTest as se: + if se.args: + msg = se.args[0] + else: + msg = "" + common.verbosePrint("\nSkipped test: %s" % msg) + finally: + common.verbosePrint('') # separator line between tests + return newmethod + + @ignore_skipped + def test_method(self): + common.verbosePrint("* Condition is ``%s``." % cond) + # Replace bitwise operators with their logical counterparts. + pycond = cond + for (ptop, pyop) in [('&', 'and'), ('|', 'or'), ('~', 'not')]: + pycond = pycond.replace(ptop, pyop) + pycond = compile(pycond, '', 'eval') + + table = self.table + self.create_indexes(colname, ncolname, 'c_idxextra') + + table_slice = dict(start=1, stop=table.nrows - 5, step=3) + rownos, fvalues = None, None + # Test that both simple and nested columns work as expected. + # Knowing how the table is filled, results must be the same. + for acolname in [colname, ncolname]: + # First the reference Python version. + pyrownos, pyfvalues, pyvars = [], [], condvars.copy() + for row in table.iterrows(**table_slice): + pyvars[colname] = row[acolname] + pyvars['c_extra'] = row['c_extra'] + pyvars['c_idxextra'] = row['c_idxextra'] + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + 'ignore', + 'invalid value encountered in arc(cos|sin)', + RuntimeWarning) + isvalidrow = eval(pycond, func_info, pyvars) + except TypeError: + raise SilentlySkipTest( + "The Python type does not support the operation.") + if isvalidrow: + pyrownos.append(row.nrow) + pyfvalues.append(row[acolname]) + pyrownos = np.array(pyrownos) # row numbers already sorted + pyfvalues = np.array(pyfvalues, dtype=sctype) + pyfvalues.sort() + common.verbosePrint(f"* {len(pyrownos)} rows selected by Python " + f"from ``{acolname}``.") + if rownos is None: + rownos = pyrownos # initialise reference results + fvalues = pyfvalues + else: + self.assertTrue(np.all(pyrownos == rownos)) # check + self.assertTrue(np.all(pyfvalues == fvalues)) + + # Then the in-kernel or indexed version. + ptvars = condvars.copy() + ptvars[colname] = table.colinstances[acolname] + ptvars['c_extra'] = table.colinstances['c_extra'] + ptvars['c_idxextra'] = table.colinstances['c_idxextra'] + try: + isidxq = table.will_query_use_indexing(cond, ptvars) + # Query twice to trigger possible query result caching. 
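+                # (Each call below is issued twice with identical arguments;
+                # the assertions at the end of the loop check that both
+                # passes return exactly the same row numbers and values.)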
+ ptrownos = [table.get_where_list(cond, condvars, sort=True, + **table_slice) + for _ in range(2)] + ptfvalues = [ + table.read_where(cond, condvars, field=acolname, + **table_slice) + for _ in range(2) + ] + except TypeError as te: + if self.condNotBoolean_re.search(str(te)): + raise SilentlySkipTest("The condition is not boolean.") + raise + except NotImplementedError: + raise SilentlySkipTest( + "The PyTables type does not support the operation.") + for ptfvals in ptfvalues: # row numbers already sorted + ptfvals.sort() + common.verbosePrint(f"* {len(ptrownos[0])} rows selected by " + f"PyTables from ``{acolname}``", nonl=True) + common.verbosePrint(f"(indexing: {'yes' if isidxq else 'no'}).") + self.assertTrue(np.all(ptrownos[0] == rownos)) + self.assertTrue(np.all(ptfvalues[0] == fvalues)) + # The following test possible caching of query results. + self.assertTrue(np.all(ptrownos[0] == ptrownos[1])) + self.assertTrue(np.all(ptfvalues[0] == ptfvalues[1])) + + test_method.__doc__ = "Testing ``%s``." % cond + return test_method + + +def add_test_method(type_, op, extracond='', func=None): + global testn + # Decide to which set the test belongs. + heavy = type_ in heavy_types or op in heavy_operators + if heavy: + testfmt = TableDataTestCase._testfmt_heavy + else: + testfmt = TableDataTestCase._testfmt_light + tmethod = create_test_method(type_, op, extracond, func) + # The test number is appended to the docstring to help + # identify failing methods in non-verbose mode. + tmethod.__name__ = testfmt % testn + tmethod.__doc__ += testfmt % testn + setattr(TableDataTestCase, tmethod.__name__, tmethod) + testn += 1 + + +# Create individual tests. You may restrict which tests are generated +# by replacing the sequences in the ``for`` statements. For instance: +testn = 0 +for type_ in type_info: # for type_ in ['string']: + for op in operators: # for op in ['!=']: + for extracond in extra_conditions: # for extracond in ['']: + add_test_method(type_, op, extracond) + +for type_ in ['float32', 'float64']: + for func in func_info: # i for func in ['log10']: + for op in operators: + add_test_method(type_, op, func=func) + +# Base classes for non-indexed queries. +NX_BLOCK_SIZE1 = 128 # from ``interpreter.c`` in Numexpr +NX_BLOCK_SIZE2 = 8 # from ``interpreter.c`` in Numexpr + + +class SmallNITableMixin: + nrows = row_period * 2 + assert NX_BLOCK_SIZE2 < nrows < NX_BLOCK_SIZE1 + assert nrows % NX_BLOCK_SIZE2 != 0 # to have some residual rows + + +class BigNITableMixin: + nrows = row_period * 3 + assert nrows > NX_BLOCK_SIZE1 + NX_BLOCK_SIZE2 + assert nrows % NX_BLOCK_SIZE1 != 0 + assert nrows % NX_BLOCK_SIZE2 != 0 # to have some residual rows + + +# Parameters for non-indexed queries. +table_sizes = ['Small', 'Big'] +heavy_table_sizes = frozenset(['Big']) +table_ndims = ['Scalar'] # to enable multidimensional testing, include 'MD' + +# Non-indexed queries: ``[SB][SM]TDTestCase``, where: +# +# 1. S is for small and B is for big size table. +# Sizes are listed in `table_sizes`. +# 2. S is for scalar and M for multidimensional columns. +# Dimensionalities are listed in `table_ndims`. + + +def niclassdata(): + for size in table_sizes: + heavy = size in heavy_table_sizes + for ndim in table_ndims: + classname = '{}{}TDTestCase'.format(size[0], ndim[0]) + cbasenames = ('%sNITableMixin' % size, '%sTableMixin' % ndim, + 'TableDataTestCase') + classdict = dict(heavy=heavy) + yield (classname, cbasenames, classdict) + + +# Base classes for the different type index. 
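+# (Each mixin below only fixes the index ``kind`` that ``create_indexes()``
+# passes on to ``Column.create_index()``.)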
+class UltraLightITableMixin: + kind = "ultralight" + + +class LightITableMixin: + kind = "light" + + +class MediumITableMixin: + kind = "medium" + + +class FullITableMixin: + kind = "full" + +# Base classes for indexed queries. + + +class SmallSTableMixin: + nrows = 50 + + +class MediumSTableMixin: + nrows = 100 + + +class BigSTableMixin: + nrows = 500 + + +# Parameters for indexed queries. +ckinds = ['UltraLight', 'Light', 'Medium', 'Full'] +itable_sizes = ['Small', 'Medium', 'Big'] +heavy_itable_sizes = frozenset(['Medium', 'Big']) +itable_optvalues = [0, 1, 3, 7, 9] +heavy_itable_optvalues = frozenset([0, 1, 7, 9]) + +# Indexed queries: ``[SMB]I[ulmf]O[01379]TDTestCase``, where: +# +# 1. S is for small, M for medium and B for big size table. +# Sizes are listed in `itable_sizes`. +# 2. U is for 'ultraLight', L for 'light', M for 'medium', F for 'Full' indexes +# Index types are listed in `ckinds`. +# 3. 0 to 9 is the desired index optimization level. +# Optimizations are listed in `itable_optvalues`. + + +def iclassdata(): + for ckind in ckinds: + for size in itable_sizes: + for optlevel in itable_optvalues: + heavy = (optlevel in heavy_itable_optvalues + or size in heavy_itable_sizes) + classname = '%sI%sO%dTDTestCase' % ( + size[0], ckind[0], optlevel) + cbasenames = ('%sSTableMixin' % size, + '%sITableMixin' % ckind, + 'ScalarTableMixin', + 'TableDataTestCase') + classdict = dict(heavy=heavy, optlevel=optlevel, indexed=True) + yield (classname, cbasenames, classdict) + + +# Create test classes. +for cdatafunc in [niclassdata, iclassdata]: + for (cname, cbasenames, cdict) in cdatafunc(): + cbases = tuple(eval(cbase) for cbase in cbasenames) + class_ = type(cname, cbases, cdict) + exec('%s = class_' % cname) + + +# Test cases on query usage +# ------------------------- +class BaseTableUsageTestCase(BaseTableQueryTestCase): + nrows = row_period + + +_gvar = None +"""Use this when a global variable is needed.""" + + +class ScalarTableUsageTestCase(ScalarTableMixin, BaseTableUsageTestCase): + """Test case for query usage on scalar tables. + + This also tests for most usage errors and situations. 
+ + """ + + def test_empty_condition(self): + """Using an empty condition.""" + + self.assertRaises(SyntaxError, self.table.where, '') + + def test_syntax_error(self): + """Using a condition with a syntax error.""" + + self.assertRaises(SyntaxError, self.table.where, 'foo bar') + + def test_unsupported_object(self): + """Using a condition with an unsupported object.""" + + self.assertRaises(TypeError, self.table.where, '[]') + self.assertRaises(TypeError, self.table.where, 'obj', {'obj': {}}) + self.assertRaises(TypeError, self.table.where, 'c_bool < []') + + def test_unsupported_syntax(self): + """Using a condition with unsupported syntax.""" + + self.assertRaises(TypeError, self.table.where, 'c_bool[0]') + self.assertRaises(TypeError, self.table.where, 'c_bool()') + self.assertRaises(NameError, self.table.where, 'c_bool.__init__') + + def test_no_column(self): + """Using a condition with no participating columns.""" + + self.assertRaises(ValueError, self.table.where, 'True') + + def test_foreign_column(self): + """Using a condition with a column from other table.""" + + table2 = self.h5file.create_table('/', 'other', self.tableDescription) + self.assertRaises(ValueError, self.table.where, + 'c_int32_a + c_int32_b > 0', + {'c_int32_a': self.table.cols.c_int32, + 'c_int32_b': table2.cols.c_int32}) + + def test_unsupported_op(self): + """Using a condition with unsupported operations on types.""" + + NIE = NotImplementedError + self.assertRaises(NIE, self.table.where, 'c_complex128 > 0j') + self.assertRaises(NIE, self.table.where, 'c_string + b"a" > b"abc"') + + def test_not_boolean(self): + """Using a non-boolean condition.""" + + self.assertRaises(TypeError, self.table.where, 'c_int32') + + def test_nested_col(self): + """Using a condition with nested columns.""" + + self.assertRaises(TypeError, self.table.where, 'c_nested') + + def test_implicit_col(self): + """Using implicit column names in conditions.""" + + # If implicit columns didn't work, a ``NameError`` would be raised. + self.assertRaises(TypeError, self.table.where, 'c_int32') + # If overriding didn't work, no exception would be raised. + self.assertRaises(TypeError, self.table.where, + 'c_bool', {'c_bool': self.table.cols.c_int32}) + # External variables do not override implicit columns. + + def where_with_locals(): + c_int32 = self.table.cols.c_bool # this wouldn't cause an error + self.assertIsNotNone(c_int32) + self.table.where('c_int32') + self.assertRaises(TypeError, where_with_locals) + + def test_condition_vars(self): + """Using condition variables in conditions.""" + + # If condition variables didn't work, a ``NameError`` would be raised. + self.assertRaises(NotImplementedError, self.table.where, + 'c_string > bound', {'bound': 0}) + + def where_with_locals(): + bound = 'foo' # this wouldn't cause an error + # silence pyflakes warnings + self.assertIsInstance(bound, str) + self.table.where('c_string > bound', {'bound': 0}) + self.assertRaises(NotImplementedError, where_with_locals) + + def where_with_globals(): + global _gvar + _gvar = 'foo' # this wouldn't cause an error + # silence pyflakes warnings + self.assertIsInstance(_gvar, str) + try: + self.table.where('c_string > _gvar', {'_gvar': 0}) + finally: + del _gvar # to keep global namespace clean + self.assertRaises(NotImplementedError, where_with_globals) + + def test_scopes(self): + """Looking up different scopes for variables.""" + + # Make sure the variable is not implicit. 
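+        # ('col' is not a column of this table, so it can only be resolved
+        #  from the condition variables mapping, the local scope or the
+        #  global scope, which the checks below exercise in that order.)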
+ self.assertRaises(NameError, self.table.where, 'col') + + # First scope: dictionary of condition variables. + self.assertRaises(TypeError, self.table.where, + 'col', {'col': self.table.cols.c_int32}) + + # Second scope: local variables. + def where_whith_locals(): + col = self.table.cols.c_int32 + self.assertIsNotNone(col) + self.table.where('col') + self.assertRaises(TypeError, where_whith_locals) + + # Third scope: global variables. + def where_with_globals(): + global _gvar + _gvar = self.table.cols.c_int32 + # silence pyflakes warnings + self.assertIsNotNone(_gvar) + try: + self.table.where('_gvar') + finally: + del _gvar # to keep global namespace clean + + self.assertRaises(TypeError, where_with_globals) + + +class MDTableUsageTestCase(MDTableMixin, BaseTableUsageTestCase): + """Test case for query usage on multidimensional tables.""" + + def test(self): + """Using a condition on a multidimensional table.""" + + # Easy: queries on multidimensional tables are not implemented yet! + self.assertRaises(NotImplementedError, self.table.where, 'c_bool') + + +class IndexedTableUsage(ScalarTableMixin, BaseTableUsageTestCase): + """Test case for query usage on indexed tables. + + Indexing could be used in more cases, but it is expected to kick in + at least in the cases tested here. + + """ + nrows = 50 + indexed = True + + def setUp(self): + super().setUp() + self.table.cols.c_bool.create_index(_blocksizes=small_blocksizes) + self.table.cols.c_int32.create_index(_blocksizes=small_blocksizes) + self.will_query_use_indexing = self.table.will_query_use_indexing + self.compileCondition = self.table._compile_condition + self.requiredExprVars = self.table._required_expr_vars + usable_idxs = set() + for expr in self.idx_expr: + idxvar = expr[0] + if idxvar not in usable_idxs: + usable_idxs.add(idxvar) + self.usable_idxs = frozenset(usable_idxs) + + def test(self): + for condition in self.conditions: + c_usable_idxs = self.will_query_use_indexing(condition, {}) + self.assertEqual(c_usable_idxs, self.usable_idxs, + f"\nQuery with condition: ``{condition}``\n" + f"Computed usable indexes are: " + f"``{c_usable_idxs}``\nand should be: " + f"``{self.usable_idxs}``") + condvars = self.requiredExprVars(condition, None) + compiled = self.compileCondition(condition, condvars) + c_idx_expr = compiled.index_expressions + self.assertEqual(c_idx_expr, self.idx_expr, + f"\nWrong index expression in condition:\n" + f"``{condition}``\nCompiled index expression is:" + f"\n``{c_idx_expr}``\nand should be:\n" + f"``{self.idx_expr}``") + c_str_expr = compiled.string_expression + self.assertEqual(c_str_expr, self.str_expr, + f"\nWrong index operations in condition:\n" + f"``{condition}``\nComputed index operations are:" + f"\n``{c_str_expr}``\nand should be:\n" + f"``{self.str_expr}``") + common.verbosePrint( + f"* Query with condition ``{condition}`` will use variables " + f"``{compiled.index_variables}`` for indexing.") + + +class IndexedTableUsage1(IndexedTableUsage): + conditions = [ + '(c_int32 > 0)', + '(c_int32 > 0) & (c_extra > 0)', + '(c_int32 > 0) & ((~c_bool) | (c_extra > 0))', + '(c_int32 > 0) & ((c_extra < 3) & (c_extra > 0))', + ] + idx_expr = [('c_int32', ('gt',), (0,))] + str_expr = 'e0' + + +class IndexedTableUsage2(IndexedTableUsage): + conditions = [ + '(c_int32 > 0) & (c_int32 < 5)', + '(c_int32 > 0) & (c_int32 < 5) & (c_extra > 0)', + '(c_int32 > 0) & (c_int32 < 5) & ((c_bool == True) | (c_extra > 0))', + '(c_int32 > 0) & (c_int32 < 5) & ((c_extra > 0) | (c_bool == True))', + ] + idx_expr = 
[('c_int32', ('gt', 'lt'), (0, 5))] + str_expr = 'e0' + + +class IndexedTableUsage3(IndexedTableUsage): + conditions = [ + '(c_bool == True)', + '(c_bool == True) & (c_extra > 0)', + '(c_extra > 0) & (c_bool == True)', + '((c_extra > 0) & (c_extra < 4)) & (c_bool == True)', + '(c_bool == True) & ((c_extra > 0) & (c_extra < 4))', + ] + idx_expr = [('c_bool', ('eq',), (True,))] + str_expr = 'e0' + + +class IndexedTableUsage4(IndexedTableUsage): + conditions = [ + '((c_int32 > 0) & (c_bool == True)) & (c_extra > 0)', + '((c_int32 > 0) & (c_bool == True)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_int32', ('gt',), (0,)), + ('c_bool', ('eq',), (True,)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage5(IndexedTableUsage): + conditions = [ + '(c_int32 >= 1) & (c_int32 < 2) & (c_bool == True)', + '(c_int32 >= 1) & (c_int32 < 2) & (c_bool == True)' + + ' & (c_extra > 0)', + ] + idx_expr = [('c_int32', ('ge', 'lt'), (1, 2)), + ('c_bool', ('eq',), (True,)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage6(IndexedTableUsage): + conditions = [ + '(c_int32 >= 1) & (c_int32 < 2) & (c_int32 > 0) & (c_int32 < 5)', + '(c_int32 >= 1) & (c_int32 < 2) & (c_int32 > 0) & (c_int32 < 5)' + + ' & (c_extra > 0)', + ] + idx_expr = [('c_int32', ('ge', 'lt'), (1, 2)), + ('c_int32', ('gt',), (0,)), + ('c_int32', ('lt',), (5,)), + ] + str_expr = '((e0 & e1) & e2)' + + +class IndexedTableUsage7(IndexedTableUsage): + conditions = [ + '(c_int32 >= 1) & (c_int32 < 2) & ((c_int32 > 0) & (c_int32 < 5))', + '((c_int32 >= 1) & (c_int32 < 2)) & ((c_int32 > 0) & (c_int32 < 5))', + '((c_int32 >= 1) & (c_int32 < 2)) & ((c_int32 > 0) & (c_int32 < 5))' + + ' & (c_extra > 0)', + ] + idx_expr = [('c_int32', ('ge', 'lt'), (1, 2)), + ('c_int32', ('gt', 'lt'), (0, 5)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage8(IndexedTableUsage): + conditions = [ + '(c_extra > 0) & ((c_int32 > 0) & (c_int32 < 5))', + ] + idx_expr = [('c_int32', ('gt', 'lt'), (0, 5)), + ] + str_expr = 'e0' + + +class IndexedTableUsage9(IndexedTableUsage): + conditions = [ + '(c_extra > 0) & (c_int32 > 0) & (c_int32 < 5)', + '((c_extra > 0) & (c_int32 > 0)) & (c_int32 < 5)', + '(c_extra > 0) & (c_int32 > 0) & (c_int32 < 5) & (c_extra > 3)', + ] + idx_expr = [('c_int32', ('gt',), (0,)), + ('c_int32', ('lt',), (5,))] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage10(IndexedTableUsage): + conditions = [ + '(c_int32 < 5) & (c_extra > 0) & (c_bool == True)', + '(c_int32 < 5) & (c_extra > 2) & c_bool', + '(c_int32 < 5) & (c_bool == True) & (c_extra > 0) & (c_extra < 4)', + '(c_int32 < 5) & (c_extra > 0) & (c_bool == True) & (c_extra < 4)', + ] + idx_expr = [('c_int32', ('lt',), (5,)), + ('c_bool', ('eq',), (True,))] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage11(IndexedTableUsage): + """Complex operations are not eligible for indexing.""" + + conditions = [ + 'sin(c_int32) > 0', + '(c_int32 * 2.4) > 0', + '(c_int32 + c_int32) > 0', + 'c_int32**2 > 0', + ] + idx_expr = [] + str_expr = '' + + +class IndexedTableUsage12(IndexedTableUsage): + conditions = [ + '~c_bool', + '~(c_bool)', + '~c_bool & (c_extra > 0)', + '~(c_bool) & (c_extra > 0)', + ] + idx_expr = [('c_bool', ('eq',), (False,))] + str_expr = 'e0' + + +class IndexedTableUsage13(IndexedTableUsage): + conditions = [ + '~(c_bool == True)', + '~((c_bool == True))', + '~(c_bool == True) & (c_extra > 0)', + '~((c_bool == True)) & (c_extra > 0)', + ] + idx_expr = [('c_bool', ('eq',), (False,))] + str_expr = 'e0' + + +class 
IndexedTableUsage14(IndexedTableUsage): + conditions = [ + '~(c_int32 > 0)', + '~((c_int32 > 0)) & (c_extra > 0)', + '~(c_int32 > 0) & ((~c_bool) | (c_extra > 0))', + '~(c_int32 > 0) & ((c_extra < 3) & (c_extra > 0))', + ] + idx_expr = [('c_int32', ('le',), (0,))] + str_expr = 'e0' + + +class IndexedTableUsage15(IndexedTableUsage): + conditions = [ + '(~(c_int32 > 0) | ~c_bool)', + '(~(c_int32 > 0) | ~(c_bool)) & (c_extra > 0)', + '(~(c_int32 > 0) | ~(c_bool == True)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_int32', ('le',), (0,)), + ('c_bool', ('eq',), (False,)), + ] + str_expr = '(e0 | e1)' + + +class IndexedTableUsage16(IndexedTableUsage): + conditions = [ + '(~(c_int32 > 0) & ~(c_int32 < 2))', + '(~(c_int32 > 0) & ~(c_int32 < 2)) & (c_extra > 0)', + '(~(c_int32 > 0) & ~(c_int32 < 2)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_int32', ('le',), (0,)), + ('c_int32', ('ge',), (2,)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage17(IndexedTableUsage): + conditions = [ + '(~(c_int32 > 0) & ~(c_int32 < 2))', + '(~(c_int32 > 0) & ~(c_int32 < 2)) & (c_extra > 0)', + '(~(c_int32 > 0) & ~(c_int32 < 2)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_int32', ('le',), (0,)), + ('c_int32', ('ge',), (2,)), + ] + str_expr = '(e0 & e1)' + +# Negations of complex conditions are not supported yet + + +class IndexedTableUsage18(IndexedTableUsage): + conditions = [ + '~((c_int32 > 0) & (c_bool))', + '~((c_int32 > 0) & (c_bool)) & (c_extra > 0)', + '~((c_int32 > 0) & (c_bool)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [] + str_expr = '' + + +class IndexedTableUsage19(IndexedTableUsage): + conditions = [ + '~((c_int32 > 0) & (c_bool)) & ((c_bool == False)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_bool', ('eq',), (False,)), + ] + str_expr = 'e0' + + +class IndexedTableUsage20(IndexedTableUsage): + conditions = [ + '((c_int32 > 0) & ~(c_bool))', + '((c_int32 > 0) & ~(c_bool)) & (c_extra > 0)', + '((c_int32 > 0) & ~(c_bool == True)) & ((c_extra > 0) & (c_extra < 4))' + ] + idx_expr = [ + ('c_int32', ('gt',), (0,)), + ('c_bool', ('eq',), (False,)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage21(IndexedTableUsage): + conditions = [ + '(~(c_int32 > 0) & (c_bool))', + '(~(c_int32 > 0) & (c_bool)) & (c_extra > 0)', + '(~(c_int32 > 0) & (c_bool == True)) & ((c_extra > 0)' + + ' & (c_extra < 4))', + ] + idx_expr = [('c_int32', ('le',), (0,)), + ('c_bool', ('eq',), (True,)), + ] + str_expr = '(e0 & e1)' + + +class IndexedTableUsage22(IndexedTableUsage): + conditions = [ + '~((c_int32 >= 1) & (c_int32 < 2)) & ~(c_bool == True)', + '~(c_bool == True) & (c_extra > 0)', + '~((c_int32 >= 1) & (c_int32 < 2)) & (~(c_bool == True)' + + ' & (c_extra > 0))', + ] + idx_expr = [('c_bool', ('eq',), (False,)), + ] + str_expr = 'e0' + + +class IndexedTableUsage23(IndexedTableUsage): + conditions = [ + 'c_int32 != 1', + 'c_bool != False', + '~(c_int32 != 1)', + '~(c_bool != False)', + '(c_int32 != 1) & (c_extra != 2)', + ] + idx_expr = [] + str_expr = '' + + +class IndexedTableUsage24(IndexedTableUsage): + conditions = [ + 'c_bool', + 'c_bool == True', + 'True == c_bool', + '~(~c_bool)', + '~~c_bool', + '~~~~c_bool', + '~(~c_bool) & (c_extra != 2)', + ] + idx_expr = [('c_bool', ('eq',), (True,)), + ] + str_expr = 'e0' + + +class IndexedTableUsage25(IndexedTableUsage): + conditions = [ + '~c_bool', + 'c_bool == False', + 'False == c_bool', + '~(c_bool)', + '~((c_bool))', + '~~~c_bool', + '~~(~c_bool) & (c_extra != 2)', + ] + 
idx_expr = [ + ('c_bool', ('eq',), (False,)), + ] + str_expr = 'e0' + + +class IndexedTableUsage26(IndexedTableUsage): + conditions = [ + 'c_bool != True', + 'True != c_bool', + 'c_bool != False', + 'False != c_bool', + ] + idx_expr = [] + str_expr = '' + + +class IndexedTableUsage27(IndexedTableUsage): + conditions = [ + '(c_int32 == 3) | c_bool | (c_int32 == 5)', + '(((c_int32 == 3) | (c_bool == True)) | (c_int32 == 5))' + + ' & (c_extra > 0)', + ] + idx_expr = [ + ('c_int32', ('eq',), (3,)), + ('c_bool', ('eq',), (True,)), + ('c_int32', ('eq',), (5,)), + ] + str_expr = '((e0 | e1) | e2)' + + +class IndexedTableUsage28(IndexedTableUsage): + conditions = [ + '((c_int32 == 3) | c_bool) & (c_int32 == 5)', + '(((c_int32 == 3) | (c_bool == True)) & (c_int32 == 5))' + + ' & (c_extra > 0)', + ] + idx_expr = [ + ('c_int32', ('eq',), (3,)), + ('c_bool', ('eq',), (True,)), + ('c_int32', ('eq',), (5,)), + ] + str_expr = '((e0 | e1) & e2)' + + +class IndexedTableUsage29(IndexedTableUsage): + conditions = [ + '(c_int32 == 3) | ((c_int32 == 4) & (c_int32 == 5))', + '((c_int32 == 3) | ((c_int32 == 4) & (c_int32 == 5)))' + + ' & (c_extra > 0)', + ] + idx_expr = [ + ('c_int32', ('eq',), (4,)), + ('c_int32', ('eq',), (5,)), + ('c_int32', ('eq',), (3,)), + ] + str_expr = '((e0 & e1) | e2)' + + +class IndexedTableUsage30(IndexedTableUsage): + conditions = [ + '((c_int32 == 3) | (c_int32 == 4)) & (c_int32 == 5)', + '((c_int32 == 3) | (c_int32 == 4)) & (c_int32 == 5)' + + ' & (c_extra > 0)', + ] + idx_expr = [ + ('c_int32', ('eq',), (3,)), + ('c_int32', ('eq',), (4,)), + ('c_int32', ('eq',), (5,)), + ] + str_expr = '((e0 | e1) & e2)' + + +class IndexedTableUsage31(IndexedTableUsage): + conditions = [ + '(c_extra > 0) & ((c_extra < 4) & (c_bool == True))', + '(c_extra > 0) & ((c_bool == True) & (c_extra < 5))', + '((c_int32 > 0) | (c_extra > 0)) & (c_bool == True)', + ] + idx_expr = [ + ('c_bool', ('eq',), (True,)), + ] + str_expr = 'e0' + + +class IndexedTableUsage32(IndexedTableUsage): + conditions = [ + '(c_int32 < 5) & (c_extra > 0) & (c_bool == True) | (c_extra < 4)', + ] + idx_expr = [] + str_expr = '' + + +# Main part +# --------- +def suite(): + """Return a test suite consisting of all the test cases in the module.""" + + testSuite = common.unittest.TestSuite() + + cdatafuncs = [niclassdata] # non-indexing data tests + cdatafuncs.append(iclassdata) # indexing data tests + + heavy = common.heavy + # Choose which tests to run in classes with autogenerated tests. + if heavy: + autoprefix = 'test' # all tests + else: + autoprefix = 'test_l' # only light tests + + niter = 1 + for i in range(niter): + # Tests on query data. + for cdatafunc in cdatafuncs: + for cdata in cdatafunc(): + class_ = eval(cdata[0]) + if heavy or not class_.heavy: + suite_ = common.unittest.makeSuite(class_, + prefix=autoprefix) + testSuite.addTest(suite_) + # Tests on query usage. 
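+    # (The usage test cases below have hand-written test methods, so they
+    #  are added one by one instead of through the autogeneration machinery
+    #  used above for the data tests.)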
+ testSuite.addTest(common.unittest.makeSuite(ScalarTableUsageTestCase)) + testSuite.addTest(common.unittest.makeSuite(MDTableUsageTestCase)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage1)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage2)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage3)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage4)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage5)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage6)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage7)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage8)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage9)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage10)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage11)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage12)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage13)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage14)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage15)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage16)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage17)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage18)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage19)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage20)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage21)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage22)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage23)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage24)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage25)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage26)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage27)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage28)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage29)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage30)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage31)) + testSuite.addTest(common.unittest.makeSuite(IndexedTableUsage32)) + + return testSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_ref_array1.mat b/tables/tests/test_ref_array1.mat new file mode 100644 index 0000000..d449504 Binary files /dev/null and b/tables/tests/test_ref_array1.mat differ diff --git a/tables/tests/test_ref_array2.mat b/tables/tests/test_ref_array2.mat new file mode 100644 index 0000000..0754e54 Binary files /dev/null and b/tables/tests/test_ref_array2.mat differ diff --git a/tables/tests/test_suite.py b/tables/tests/test_suite.py new file mode 100644 index 0000000..83dc01f --- /dev/null +++ b/tables/tests/test_suite.py @@ -0,0 +1,95 @@ +"""Test suite consisting of all testcases.""" + +import sys + +from tables.tests import common + + +def suite(): + test_modules = [ + 'tables.tests.test_attributes', + 'tables.tests.test_basics', + 'tables.tests.test_create', + 'tables.tests.test_backcompat', + 'tables.tests.test_types', + 'tables.tests.test_lists', + 'tables.tests.test_tables', + 'tables.tests.test_tablesMD', + 'tables.tests.test_array', + 'tables.tests.test_earray', + 'tables.tests.test_carray', + 'tables.tests.test_vlarray', + 
'tables.tests.test_tree', + 'tables.tests.test_timetype', + 'tables.tests.test_do_undo', + 'tables.tests.test_enum', + 'tables.tests.test_nestedtypes', + 'tables.tests.test_hdf5compat', + 'tables.tests.test_numpy', + 'tables.tests.test_queries', + 'tables.tests.test_expression', + 'tables.tests.test_links', + 'tables.tests.test_indexes', + 'tables.tests.test_indexvalues', + 'tables.tests.test_index_backcompat', + 'tables.tests.test_aux', + 'tables.tests.test_utils', + # Sub-packages + 'tables.nodes.tests.test_filenode', + ] + + # print('-=' * 38) + + # The test for garbage must be run *in the last place*. + # Else, it is not as useful. + test_modules.append('tables.tests.test_garbage') + + alltests = common.unittest.TestSuite() + if common.show_memory: + # Add a memory report at the beginning + alltests.addTest(common.unittest.makeSuite(common.ShowMemTime)) + for name in test_modules: + # Unexpectedly, the following code doesn't seem to work anymore + # in python 3 + # exec('from %s import suite as test_suite' % name) + __import__(name) + test_suite = sys.modules[name].suite + + alltests.addTest(test_suite()) + if common.show_memory: + # Add a memory report after each test module + alltests.addTest(common.unittest.makeSuite(common.ShowMemTime)) + return alltests + + +def test(verbose=False, heavy=False): + """Run all the tests in the test suite. + + If *verbose* is set, the test suite will emit messages with full + verbosity (not recommended unless you are looking into a certain + problem). + + If *heavy* is set, the test suite will be run in *heavy* mode (you + should be careful with this because it can take a lot of time and + resources from your computer). + + Return 0 (os.EX_OK) if all tests pass, 1 in case of failure + + """ + + common.print_versions() + common.print_heavy(heavy) + + # What a context this is! 
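+    # (A sketch of a typical call, assuming the top-level package re-exports
+    #  this function as ``tables.test``:
+    #      >>> import tables
+    #      >>> tables.test(verbose=True, heavy=False)
+    #  ``common.heavy`` is overridden for the run and restored in the
+    #  ``finally`` block below.)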
+ # oldverbose, common.verbose = common.verbose, verbose + oldheavy, common.heavy = common.heavy, heavy + try: + result = common.unittest.TextTestRunner( + verbosity=1 + int(verbose)).run(suite()) + if result.wasSuccessful(): + return 0 + else: + return 1 + finally: + # common.verbose = oldverbose + common.heavy = oldheavy # there are pretty young heavies, too ;) diff --git a/tables/tests/test_szip.h5 b/tables/tests/test_szip.h5 new file mode 100644 index 0000000..892fa4d Binary files /dev/null and b/tables/tests/test_szip.h5 differ diff --git a/tables/tests/test_tables.py b/tables/tests/test_tables.py new file mode 100644 index 0000000..b929dc9 --- /dev/null +++ b/tables/tests/test_tables.py @@ -0,0 +1,6647 @@ +import itertools +import sys +import tempfile +import struct +import platform +from pathlib import Path + +import numpy as np + +import tables as tb +from tables.tests import common + + +# To know whether the interpreter is 32 or 64 bit +def is_python_64bit(): + return struct.calcsize("P") == 8 + + +# To know whether the os platform is 32 or 64 bit +def is_os_64bit(): + return platform.machine().endswith('64') + + +# Test Record class +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"abcd", pos=0) # 4-character String + var2 = tb.IntCol(dflt=1, pos=1) # integer + var3 = tb.Int16Col(dflt=2, pos=2) # short integer + var4 = tb.Float64Col(dflt=3.1, pos=3) # double (double-precision) + var5 = tb.Float32Col(dflt=4.2, pos=4) # float (single-precision) + var6 = tb.UInt16Col(dflt=5, pos=5) # unsigned short integer + var7 = tb.StringCol(itemsize=1, dflt=b"e", pos=6) # 1-character String + var8 = tb.BoolCol(dflt=True, pos=7) # boolean + var9 = tb.ComplexCol( + itemsize=8, dflt=(0.+1.j), pos=8) # Complex single precision + var10 = tb.ComplexCol( + itemsize=16, dflt=(1.-0.j), pos=9) # Complex double precision + if hasattr(tb, 'Float16Col'): + var11 = tb.Float16Col(dflt=6.4) # float (half-precision) + if hasattr(tb, 'Float96Col'): + var12 = tb.Float96Col(dflt=6.4) # float (extended precision) + if hasattr(tb, 'Float128Col'): + var13 = tb.Float128Col(dflt=6.4) # float (extended precision) + if hasattr(tb, 'Complex192Col'): + var14 = tb.ComplexCol( + itemsize=24, dflt=(1.-0.j)) # Complex double (extended precision) + if hasattr(tb, 'Complex256Col'): + var15 = tb.ComplexCol( + itemsize=32, dflt=(1.-0.j)) # Complex double (extended precision) + + +# Dictionary definition +RecordDescriptionDict = { + 'var1': tb.StringCol(itemsize=4, dflt=b"abcd", pos=0), # 4-char String + 'var2': tb.IntCol(dflt=1, pos=1), # integer + 'var3': tb.Int16Col(dflt=2, pos=2), # short integer + 'var4': tb.Float64Col(dflt=3.1, pos=3), # double (double-precision) + 'var5': tb.Float32Col(dflt=4.2, pos=4), # float (single-precision) + 'var6': tb.UInt16Col(dflt=5, pos=5), # unsigned short integer + 'var7': tb.StringCol( + itemsize=1, dflt=b"e", pos=6), # 1-character String + 'var8': tb.BoolCol(dflt=True, pos=7), # boolean + 'var9': tb.ComplexCol( + itemsize=8, dflt=(0.+1.j), pos=8), # Complex single precision + 'var10': tb.ComplexCol( + itemsize=16, dflt=(1.-0.j), pos=9), # Complex double precision +} + +if hasattr(tb, 'Float16Col'): + # float (half-precision) + RecordDescriptionDict['var11'] = tb.Float16Col(dflt=6.4) +if hasattr(tb, 'Float96Col'): + # float (extended precision) + RecordDescriptionDict['var12'] = tb.Float96Col(dflt=6.4) +if hasattr(tb, 'Float128Col'): + # float (extended precision) + RecordDescriptionDict['var13'] = tb.Float128Col(dflt=6.4) +if hasattr(tb, 'Complex192Col'): + # Complex 
double (extended precision) + RecordDescriptionDict['var14'] = tb.ComplexCol( + itemsize=24, dflt=(1.-0.j)) +if hasattr(tb, 'Complex256Col'): + # Complex double (extended precision) + RecordDescriptionDict['var15'] = tb.ComplexCol( + itemsize=32, dflt=(1.-0.j)) + + +# Old fashion of defining tables (for testing backward compatibility) +class OldRecord(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"abcd", pos=0) + var2 = tb.Col.from_type("int32", (), 1, pos=1) + var3 = tb.Col.from_type("int16", (), 2, pos=2) + var4 = tb.Col.from_type("float64", (), 3.1, pos=3) + var5 = tb.Col.from_type("float32", (), 4.2, pos=4) + var6 = tb.Col.from_type("uint16", (), 5, pos=5) + var7 = tb.StringCol(itemsize=1, dflt=b"e", pos=6) + var8 = tb.Col.from_type("bool", shape=(), dflt=1, pos=7) + var9 = tb.ComplexCol(itemsize=8, shape=(), dflt=(0.+1.j), pos=8) + var10 = tb.ComplexCol(itemsize=16, shape=(), dflt=(1.-0.j), pos=9) + if hasattr(tb, 'Float16Col'): + var11 = tb.Col.from_type("float16", (), 6.4) + if hasattr(tb, 'Float96Col'): + var12 = tb.Col.from_type("float96", (), 6.4) + if hasattr(tb, 'Float128Col'): + var13 = tb.Col.from_type("float128", (), 6.4) + if hasattr(tb, 'Complex192Col'): + var14 = tb.ComplexCol(itemsize=24, shape=(), dflt=(1.-0.j)) + if hasattr(tb, 'Complex256Col'): + var15 = tb.ComplexCol(itemsize=32, shape=(), dflt=(1.-0.j)) + + +class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase): + # file = "test.h5" + open_mode = "w" + title = "This is the table title" + expectedrows = 100 + appendrows = 20 + compress = 0 + shuffle = 0 + bitshuffle = 0 + fletcher32 = 0 + complib = "zlib" # Default compression library + record = Record + recarrayinit = 0 + maxshort = 1 << 15 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + self.h5file.close() + + def initRecArray(self): + record = self.recordtemplate + row = record[0] + buflist = [] + # Fill the recarray + for i in range(self.expectedrows): + tmplist = [] + var1 = '%04d' % (self.expectedrows - i) + tmplist.append(var1) + var2 = i + tmplist.append(var2) + var3 = i % self.maxshort + tmplist.append(var3) + if isinstance(row['var4'], np.ndarray): + tmplist.append([float(i), float(i * i)]) + else: + tmplist.append(float(i)) + if isinstance(row['var5'], np.ndarray): + tmplist.append(np.array((float(i),)*4)) + else: + tmplist.append(float(i)) + # var6 will be like var3 but byteswaped + tmplist.append(((var3 >> 8) & 0xff) + ((var3 << 8) & 0xff00)) + var7 = var1[-1] + tmplist.append(var7) + if isinstance(row['var8'], np.ndarray): + tmplist.append([0, 10]) # should be equivalent to [0,1] + else: + tmplist.append(10) # should be equivalent to 1 + if isinstance(row['var9'], np.ndarray): + tmplist.append([0.+float(i)*1j, float(i)+0.j]) + else: + tmplist.append(float(i)+0j) + if isinstance(row['var10'], np.ndarray): + tmplist.append([float(i)+0j, 1 + float(i)*1j]) + else: + tmplist.append(1 + float(i)*1j) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + tmplist.append(np.array((float(i),)*4)) + else: + tmplist.append(float(i)) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + tmplist.append(np.array((float(i),)*4)) + else: + tmplist.append(float(i)) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + tmplist.append(np.array((float(i),)*4)) + else: + tmplist.append(float(i)) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + 
tmplist.append([float(i)+0j, 1 + float(i)*1j]) + else: + tmplist.append(1 + float(i)*1j) + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + tmplist.append([float(i)+0j, 1 + float(i)*1j]) + else: + tmplist.append(1 + float(i)*1j) + + buflist.append(tuple(tmplist)) + + self.record = np.rec.array(buflist, dtype=record.dtype, + shape=self.expectedrows) + + def populateFile(self): + group = self.rootgroup + if self.recarrayinit: + # Initialize an starting buffer, if any + self.initRecArray() + for j in range(3): + # Create a table + filterprops = tb.Filters(complevel=self.compress, + shuffle=self.shuffle, + bitshuffle=self.bitshuffle, + fletcher32=self.fletcher32, + complib=self.complib) + if j < 2: + byteorder = sys.byteorder + else: + # table2 will be byteswapped + byteorder = {"little": "big", "big": "little"}[sys.byteorder] + table = self.h5file.create_table(group, 'table'+str(j), + self.record, + title=self.title, + filters=filterprops, + expectedrows=self.expectedrows, + byteorder=byteorder) + if not self.recarrayinit: + # Get the row object associated with the new table + row = table.row + # Fill the table + for i in range(self.expectedrows): + s = '%04d' % (self.expectedrows - i) + row['var1'] = s.encode('ascii') + row['var7'] = s[-1].encode('ascii') + # row['var7'] = ('%04d' % (self.expectedrows - i))[-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var8'], np.ndarray): + row['var8'] = [0, 1] + else: + row['var8'] = 1 + if isinstance(row['var9'], np.ndarray): + row['var9'] = [0.+float(i)*1j, float(i)+0.j] + else: + row['var9'] = float(i)+0.j + if isinstance(row['var10'], np.ndarray): + row['var10'] = [float(i)+0.j, 1.+float(i)*1j] + else: + row['var10'] = 1.+float(i)*1j + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + row['var11'] = np.array((float(i),)*4) + else: + row['var11'] = float(i) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + row['var12'] = np.array((float(i),)*4) + else: + row['var12'] = float(i) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + row['var13'] = np.array((float(i),)*4) + else: + row['var13'] = float(i) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + row['var14'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var14'] = 1 + float(i)*1j + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + row['var15'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var15'] = 1 + float(i)*1j + + # var6 will be like var3 but byteswaped + row['var6'] = (((row['var3'] >> 8) & 0xff) + + ((row['var3'] << 8) & 0xff00)) + # print("Saving -->", row) + row.append() + + # Flush the buffer for this table + table.flush() + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) + group = group2 + + def test00_description(self): + """Checking table description and descriptive fields.""" + + self.h5file = tb.open_file(self.h5fname) + + tbl = self.h5file.get_node('/table0') + desc = tbl.description + + if isinstance(self.record, dict): + columns = self.record + elif isinstance(self.record, np.ndarray): + descr, _ = tb.description.descr_from_dtype(self.record.dtype) + columns = 
descr._v_colobjects + elif isinstance(self.record, np.dtype): + descr, _ = tb.description.descr_from_dtype(self.record) + columns = descr._v_colobjects + else: + # This is an ordinary description. + columns = self.record.columns + + # Check table and description attributes at the same time. + # These checks are only valid for non-nested tables. + + # Column names. + fix_n_column = 10 + expectedNames = ['var%d' % n for n in range(1, fix_n_column + 1)] + types = ("float16", "float96", "float128", "complex192", "complex256") + for n, typename in enumerate(types, fix_n_column + 1): + name = typename.capitalize() + 'Col' + if hasattr(tb, name): + expectedNames.append('var%d' % n) + + self.assertEqual(expectedNames, list(tbl.colnames)) + self.assertEqual(expectedNames, list(desc._v_names)) + + # Column instances. + for colname in expectedNames: + self.assertTrue(tbl.colinstances[colname] + is tbl.cols._f_col(colname)) + + # Column types. + expectedTypes = [columns[colname].dtype + for colname in expectedNames] + self.assertEqual(expectedTypes, + [tbl.coldtypes[v] for v in expectedNames]) + self.assertEqual(expectedTypes, + [desc._v_dtypes[v] for v in expectedNames]) + + # Column string types. + expectedTypes = [columns[colname].type + for colname in expectedNames] + self.assertEqual(expectedTypes, + [tbl.coltypes[v] for v in expectedNames]) + self.assertEqual(expectedTypes, + [desc._v_types[v] for v in expectedNames]) + + # Column defaults. + for v in expectedNames: + if common.verbose: + print("dflt-->", columns[v].dflt, type(columns[v].dflt)) + print("coldflts-->", tbl.coldflts[v], type(tbl.coldflts[v])) + print("desc.dflts-->", desc._v_dflts[v], + type(desc._v_dflts[v])) + self.assertTrue( + common.areArraysEqual(tbl.coldflts[v], columns[v].dflt)) + self.assertTrue( + common.areArraysEqual(desc._v_dflts[v], columns[v].dflt)) + + # Column path names. + self.assertEqual(expectedNames, list(desc._v_pathnames)) + + # Column objects. + for colName in expectedNames: + expectedCol = columns[colName] + col = desc._v_colobjects[colName] + + self.assertEqual(expectedCol.dtype, col.dtype) + self.assertEqual(expectedCol.type, col.type) + + def test01_readTable(self): + """Checking table read.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readTable..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [rec['var2'] for rec in table.iterrows() if rec['var2'] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", rec) + print("Total selected records in table ==> ", len(result)) + nrows = self.expectedrows - 1 + rec = list(table.iterrows())[-1] + self.assertEqual((rec['var1'], rec['var2'], rec['var7']), + (b"0001", nrows, b"1")) + if isinstance(rec['var5'], np.ndarray): + self.assertTrue(common.allequal( + rec['var5'], np.array((float(nrows),)*4, np.float32))) + else: + self.assertEqual(rec['var5'], float(nrows)) + if isinstance(rec['var9'], np.ndarray): + self.assertTrue(common.allequal( + rec['var9'], np.array([0.+float(nrows)*1.j, float(nrows)+0.j], + np.complex64))) + else: + self.assertEqual((rec['var9']), float(nrows)+0.j) + self.assertEqual(len(result), 20) + + def test01a_fetch_all_fields(self): + """Checking table read (using Row.fetch_all_fields)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_fetch_all_fields..." % + self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [rec.fetch_all_fields() for rec in table.iterrows() + if rec['var2'] < 20] + rec = result[-1] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", rec) + print("Total selected records in table ==> ", len(result)) + nrows = 20 - 1 + strnrows = "%04d" % (self.expectedrows - nrows) + strnrows = strnrows.encode('ascii') + self.assertEqual((rec['var1'], rec['var2'], rec['var7']), + (strnrows, nrows, b"1")) + if isinstance(rec['var5'], np.ndarray): + self.assertTrue(common.allequal( + rec['var5'], np.array((float(nrows),)*4, np.float32))) + else: + self.assertEqual(rec['var5'], float(nrows)) + if isinstance(rec['var9'], np.ndarray): + self.assertTrue(common.allequal( + rec['var9'], + np.array([0.+float(nrows)*1.j, float(nrows)+0.j], + np.complex64))) + else: + self.assertEqual(rec['var9'], float(nrows)+0.j) + self.assertEqual(len(result), 20) + + def test01a_integer(self): + """Checking table read (using Row[integer])""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_integer..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [rec[1] for rec in table.iterrows() + if rec['var2'] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Total selected records in table ==> ", len(result)) + print("All results ==>", result) + self.assertEqual(len(result), 20) + self.assertEqual(result, list(range(20))) + + def test01a_extslice(self): + """Checking table read (using Row[::2])""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_extslice..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [rec[::2] for rec in table.iterrows() + if rec['var2'] < 20] + rec = result[-1] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", rec) + print("Total selected records in table ==> ", len(result)) + nrows = 20 - 1 + strnrows = "%04d" % (self.expectedrows - nrows) + strnrows = strnrows.encode('ascii') + self.assertEqual(rec[:2], (strnrows, 19)) + self.assertEqual(rec[3], b'1') + if isinstance(rec[2], np.ndarray): + self.assertTrue(common.allequal( + rec[2], np.array((float(nrows),)*4, np.float32))) + else: + self.assertEqual(rec[2], nrows) + if isinstance(rec[4], np.ndarray): + self.assertTrue(common.allequal( + rec[4], np.array([0.+float(nrows)*1.j, float(nrows)+0.j], + np.complex64))) + else: + self.assertEqual(rec[4], float(nrows)+0.j) + self.assertEqual(len(result), 20) + + def test01a_nofield(self): + """Checking table read (using Row['no-field'])""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_nofield..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Check that a KeyError is raised + # self.assertRaises only work with functions + # self.assertRaises(KeyError, [rec['no-field'] for rec in table]) + with self.assertRaises(KeyError): + result = [rec['no-field'] for rec in table] + if common.verbose: + print('result:', result) + + def test01a_badtypefield(self): + """Checking table read (using Row[{}])""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_badtypefield..." % + self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Check that a TypeError is raised + # self.assertRaises only work with functions + # self.assertRaises(TypeError, [rec[{}] for rec in table]) + with self.assertRaises(TypeError): + result = [rec[{}] for rec in table] + if common.verbose: + print('result:', result) + + def test01b_readTable(self): + """Checking table read and cuts (multidimensional columns case)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_readTable..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [rec['var5'] for rec in table.iterrows() + if rec['var2'] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", rec) + print("rec['var5'] ==>", rec['var5'], end=' ') + print("nrows ==>", table.nrows) + print("Total selected records in table ==> ", len(result)) + nrows = table.nrows + rec = list(table.iterrows())[-1] + if isinstance(rec['var5'], np.ndarray): + np.testing.assert_array_equal( + result[0], np.array((float(0),)*4, np.float32)) + np.testing.assert_array_equal( + result[1], np.array((float(1),)*4, np.float32)) + np.testing.assert_array_equal( + result[2], np.array((float(2),)*4, np.float32)) + np.testing.assert_array_equal( + result[3], np.array((float(3),)*4, np.float32)) + np.testing.assert_array_equal( + result[10], np.array((float(10),)*4, np.float32)) + np.testing.assert_array_equal( + rec['var5'], np.array((float(nrows-1),)*4, np.float32)) + else: + self.assertEqual(rec['var5'], float(nrows - 1)) + + # Read the records and select those with "var2" file less than 20 + result = [record['var10'] for record in table.iterrows() + if record['var2'] < 20] + if isinstance(rec['var10'], np.ndarray): + np.testing.assert_array_equal( + result[0], + np.array([float(0)+0.j, 1.+float(0)*1j], np.complex128)) + np.testing.assert_array_equal( + result[1], + np.array([float(1)+0.j, 1.+float(1)*1j], np.complex128)) + np.testing.assert_array_equal( + result[2], + np.array([float(2)+0.j, 1.+float(2)*1j], np.complex128)) + np.testing.assert_array_equal( + result[3], + np.array([float(3)+0.j, 1.+float(3)*1j], np.complex128)) + np.testing.assert_array_equal( + result[10], + np.array([float(10)+0.j, 1.+float(10)*1j], np.complex128)) + np.testing.assert_array_equal( + rec['var10'], + np.array([float(nrows-1)+0.j, 1.+float(nrows-1)*1j], + np.complex128)) + else: + self.assertEqual(rec['var10'], 1.+float(nrows-1)*1j) + self.assertEqual(len(result), 20) + + def test01c_readTable(self): + """Checking nested iterators (reading)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Read the records and select those with "var2" file less than 20 + result = [] + for rec in table.iterrows(stop=2): + for rec2 in table.iterrows(stop=2): + if rec2['var2'] < 20: + result.append([rec['var2'], rec2['var2']]) + if common.verbose: + print("result ==>", result) + + self.assertEqual(result, [[0, 0], [0, 1], [1, 0], [1, 1]]) + + def test01d_readTable(self): + """Checking nested iterators (reading, mixed conditions)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01d_readTable..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Read the records and select those with "var2" file less than 20 + result = [] + for rec in table.iterrows(stop=2): + for rec2 in table.where('var2 < 20', stop=2): + result.append([rec['var2'], rec2['var2']]) + if common.verbose: + print("result ==>", result) + + self.assertEqual(result, [[0, 0], [0, 1], [1, 0], [1, 1]]) + + def test01e_readTable(self): + """Checking nested iterators (reading, both conditions)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01e_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Read the records and select those with "var2" file less than 20 + result = [] + for rec in table.where('var3 < 2'): + for rec2 in table.where('var2 < 3'): + result.append([rec['var2'], rec2['var3']]) + if common.verbose: + print("result ==>", result) + + self.assertEqual(result, + [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]) + + def test01f_readTable(self): + """Checking nested iterators (reading, break in the loop)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01f_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Read the records and select those with "var2" file less than 20 + result = [] + for rec in table.where('var3 < 2'): + for rec2 in table.where('var2 < 4'): + if rec2['var2'] >= 3: + break + result.append([rec['var2'], rec2['var3']]) + if common.verbose: + print("result ==>", result) + + self.assertEqual(result, + [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]) + + def test01g_readTable(self): + """Checking iterator with an evanescent table.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01g_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + + # Read from an evanescent table + result = [rec['var2'] for rec in self.h5file.get_node("/table0") + if rec['var2'] < 20] + + self.assertEqual(len(result), 20) + + def test02_AppendRows(self): + """Checking whether appending record rows works or not.""" + + # Now, open it, but in "append" mode + self.h5file = tb.open_file(self.h5fname, mode="a") + self.rootgroup = self.h5file.root + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_AppendRows..." 
% self.__class__.__name__) + + # Get a table + table = self.h5file.get_node("/group0/table1") + # Get their row object + row = table.row + if common.verbose: + print("Nrows in old", table._v_pathname, ":", table.nrows) + print("Record Format ==>", table.description._v_nested_formats) + print("Record Size ==>", table.rowsize) + # Append some rows + for i in range(self.appendrows): + s = '%04d' % (self.appendrows - i) + row['var1'] = s.encode('ascii') + row['var7'] = s[-1].encode('ascii') + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var8'], np.ndarray): + row['var8'] = [0, 1] + else: + row['var8'] = 1 + if isinstance(row['var9'], np.ndarray): + row['var9'] = [0.+float(i)*1j, float(i)+0.j] + else: + row['var9'] = float(i)+0.j + if isinstance(row['var10'], np.ndarray): + row['var10'] = [float(i)+0.j, 1.+float(i)*1j] + else: + row['var10'] = 1.+float(i)*1j + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + row['var11'] = np.array((float(i),)*4) + else: + row['var11'] = float(i) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + row['var12'] = np.array((float(i),)*4) + else: + row['var12'] = float(i) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + row['var13'] = np.array((float(i),)*4) + else: + row['var13'] = float(i) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + row['var14'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var14'] = 1 + float(i)*1j + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + row['var15'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var15'] = 1 + float(i)*1j + + row.append() + + # Flush the buffer for this table and read it + table.flush() + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + nrows = self.appendrows - 1 + row = list(table.iterrows())[-1] + self.assertEqual((row['var1'], row['var2'], row['var7']), + (b"0001", nrows, b"1")) + if isinstance(row['var5'], np.ndarray): + self.assertTrue(common.allequal( + row['var5'], np.array((float(nrows),)*4, np.float32))) + else: + self.assertEqual(row['var5'], float(nrows)) + if self.appendrows <= 20: + add = self.appendrows + else: + add = 20 + self.assertEqual(len(result), 20 + add) # because we appended new rows + + # This test has been commented out because appending records without + # flushing them explicitely is being warned from now on. + # F. Alted 2006-08-03 + def _test02a_AppendRows(self): + """Checking appending records without flushing explicitely.""" + + # Now, open it, but in "append" mode + self.h5file = tb.open_file(self.h5fname, mode="a") + self.rootgroup = self.h5file.root + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02a_AppendRows..." 
% self.__class__.__name__) + + group = self.rootgroup + for group_i in range(3): + # Get a table + table = self.h5file.get_node(group, 'table'+str(group_i)) + # Get the next group + group = self.h5file.get_node(group, 'group'+str(group_i)) + # Get their row object + row = table.row + if common.verbose: + print("Nrows in old", table._v_pathname, ":", table.nrows) + print("Record Format ==>", table.description._v_nested_formats) + print("Record Size ==>", table.rowsize) + # Append some rows + for row_i in range(self.appendrows): + row['var1'] = '%04d' % (self.appendrows - row_i) + row['var7'] = row['var1'][-1] + row['var2'] = row_i + row['var3'] = row_i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(row_i), float(row_i * row_i)] + else: + row['var4'] = float(row_i) + if isinstance(row['var8'], np.ndarray): + row['var8'] = [0, 1] + else: + row['var8'] = 1 + if isinstance(row['var9'], np.ndarray): + row['var9'] = [0.+float(row_i)*1j, float(row_i)+0.j] + else: + row['var9'] = float(row_i)+0.j + if isinstance(row['var10'], np.ndarray): + row['var10'] = [float(row_i)+0.j, 1.+float(row_i)*1j] + else: + row['var10'] = 1.+float(row_i)*1j + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(row_i),)*4) + else: + row['var5'] = float(row_i) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + row['var11'] = np.array((float(row_i),)*4) + else: + row['var11'] = float(row_i) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + row['var12'] = np.array((float(row_i),)*4) + else: + row['var12'] = float(row_i) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + row['var13'] = np.array((float(row_i),)*4) + else: + row['var13'] = float(row_i) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + row['var14'] = [float(row_i)+0j, 1 + float(row_i)*1j] + else: + row['var14'] = 1 + float(row_i)*1j + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + row['var15'] = [float(row_i)+0j, 1 + float(row_i)*1j] + else: + row['var15'] = 1 + float(row_i)*1j + + row.append() + table.flush() + + # Close the file and re-open it. + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname, mode="a") + table = self.h5file.root.table0 + # Flush the buffer for this table and read it + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + nrows = self.appendrows - 1 + self.assertEqual((row['var1'], row['var2'], row['var7']), + ("0001", nrows, "1")) + if isinstance(row['var5'], np.ndarray): + self.assertTrue(common.allequal( + row['var5'], np.array((float(nrows),)*4, np.float32))) + else: + self.assertEqual(row['var5'], float(nrows)) + if self.appendrows <= 20: + add = self.appendrows + else: + add = 20 + self.assertEqual(len(result), 20 + add) # because we appended new rows + + def test02b_AppendRows(self): + """Checking whether appending *and* reading rows works or not""" + + # Now, open it, but in "append" mode + self.h5file = tb.open_file(self.h5fname, mode="a") + self.rootgroup = self.h5file.root + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_AppendRows..." 
% self.__class__.__name__) + + # Get a table + table = self.h5file.get_node("/group0/table1") + if common.verbose: + print("Nrows in old", table._v_pathname, ":", table.nrows) + print("Record Format ==>", table.description._v_nested_formats) + print("Record Size ==>", table.rowsize) + # Set a small number of buffer to make this test faster + table.nrowsinbuf = 3 + # Get their row object + row = table.row + # Append some rows (3 * table.nrowsinbuf is enough for + # checking purposes) + for i in range(3 * table.nrowsinbuf): + s = '%04d' % (self.appendrows - i) + row['var1'] = s.encode('ascii') + row['var7'] = s[-1].encode('ascii') + # row['var7'] = table.cols['var1'][i][-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var8'], np.ndarray): + row['var8'] = [0, 1] + else: + row['var8'] = 1 + if isinstance(row['var9'], np.ndarray): + row['var9'] = [0.+float(i)*1j, float(i)+0.j] + else: + row['var9'] = float(i)+0.j + if isinstance(row['var10'], np.ndarray): + row['var10'] = [float(i)+0.j, 1.+float(i)*1j] + else: + row['var10'] = 1.+float(i)*1j + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + row['var11'] = np.array((float(i),)*4) + else: + row['var11'] = float(i) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + row['var12'] = np.array((float(i),)*4) + else: + row['var12'] = float(i) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + row['var13'] = np.array((float(i),)*4) + else: + row['var13'] = float(i) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + row['var14'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var14'] = 1 + float(i)*1j + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + row['var15'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var15'] = 1 + float(i)*1j + + row.append() + # the next call can mislead the counters + result = [row2['var2'] for row2 in table] + # warning! the next will result into wrong results + # result = [ row['var2'] for row in table ] + # This is because the iterator for writing and for reading + # cannot be shared! + + # Do not flush the buffer for this table and try to read it + # We are forced now to flush tables after append operations + # because of unsolved issues with the LRU cache that are too + # difficult to track. + # F. 
Alted 2006-08-03 + table.flush() + result = [ + row3['var2'] for row3 in table.iterrows() if row3['var2'] < 20 + ] + if common.verbose: + print("Result length ==>", len(result)) + print("Result contents ==>", result) + self.assertEqual(len(result), 20 + 3 * table.nrowsinbuf) + self.assertEqual(result, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 1, 2, 3, 4, 5, 6, 7, 8]) + # Check consistency of I/O buffers when doing mixed I/O operations + # That is, the next should work in these operations + # row['var1'] = '%04d' % (self.appendrows - i) + # row['var7'] = row['var1'][-1] + result7 = [ + row4['var7'] for row4 in table.iterrows() if row4['var2'] < 20 + ] + if common.verbose: + print("Result7 length ==>", len(result7)) + print("Result7 contents ==>", result7) + self.assertEqual( + result7, + [b'0', b'9', b'8', b'7', b'6', b'5', b'4', b'3', b'2', b'1', + b'0', b'9', b'8', b'7', b'6', b'5', b'4', b'3', b'2', b'1', + b'0', b'9', b'8', b'7', b'6', b'5', b'4', b'3', b'2']) + + def test02d_AppendRows(self): + """Checking appending using the same Row object after flushing.""" + + # This test is kind of magic, but it is a good sanity check anyway. + + # Now, open it, but in "append" mode + self.h5file = tb.open_file(self.h5fname, mode="a") + self.rootgroup = self.h5file.root + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02d_AppendRows..." % self.__class__.__name__) + + # Get a table + table = self.h5file.get_node("/group0/table1") + if common.verbose: + print("Nrows in old", table._v_pathname, ":", table.nrows) + print("Record Format ==>", table.description._v_nested_formats) + print("Record Size ==>", table.rowsize) + # Set a small number of buffer to make this test faster + table.nrowsinbuf = 3 + # Get their row object + row = table.row + # Append some rows + for i in range(10): + row['var2'] = 100 + i + row.append() + # Force a flush + table.flush() + # Add new rows + for i in range(9): + row['var2'] = 110 + i + row.append() + table.flush() # XXX al eliminar... + result = [ + r['var2'] for r in table.iterrows() if 100 <= r['var2'] < 120 + ] + if common.verbose: + print("Result length ==>", len(result)) + print("Result contents ==>", result) + if table.nrows > 119: + # Case for big tables + self.assertEqual(len(result), 39) + self.assertEqual(result, + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118]) + else: + self.assertEqual(len(result), 19) + self.assertEqual(result, + [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + 110, 111, 112, 113, 114, 115, 116, 117, 118]) + + def test02e_AppendRows(self): + """Checking appending using the Row of an unreferenced table.""" + # See ticket #94 (http://www.pytables.org/trac/ticket/94). + + # Reopen the file in append mode. + self.h5file = tb.open_file(self.h5fname, mode='a') + + # Get the row handler which will outlive the reference to the table. + table = self.h5file.get_node('/group0/table1') + oldnrows = table.nrows + row = table.row + + # Few appends are made to avoid flushing the buffers in ``row``. + + # First case: append to an alive (referenced) table. + row.append() + table.flush() + newnrows = table.nrows + self.assertEqual(newnrows, oldnrows + 1, + "Append to alive table failed.") + + if self.h5file._node_manager.cache.nslots == 0: + # Skip this test from here on because the second case + # won't work when thereis not a node cache. 
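# --------------------------------------------------------------------
# Illustrative sketch (separate from this test file): the pattern the
# test above checks, from ticket #94 -- a Row handle that outlives its
# Table reference can still append, provided the node cache keeps the
# table node alive and the table is flushed before reading it back.
# The file name "demo_row_append.h5" and the /demo table are made-up
# examples, not nodes used by the test suite.
import tables as tb

with tb.open_file("demo_row_append.h5", "w") as h5f:
    demo = h5f.create_table('/', 'demo', {'var2': tb.Int32Col()})
    row = demo.row          # keep the Row handle
    del demo                # drop the Table reference ("dead" table)
    row['var2'] = 1
    row.append()            # append through the surviving Row handle
    demo = h5f.get_node('/demo')
    demo.flush()            # flush before reading appended rows back
    assert demo.nrows == 1
# --------------------------------------------------------------------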
+ return + + # Second case: append to a dead (unreferenced) table. + del table + row.append() + table = self.h5file.get_node('/group0/table1') + table.flush() + newnrows = table.nrows + self.assertEqual(newnrows, oldnrows + 2, + "Append to dead table failed.") + + # CAVEAT: The next test only works for tables with rows < 2**15 + def test03_endianess(self): + """Checking if table is endianess aware.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_endianess..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/group0/group1/table2") + + # Read the records and select the ones with "var3" column less than 20 + result = [rec['var2'] for rec in table.iterrows() if rec['var3'] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("On-disk byteorder ==>", table.byteorder) + print("Last record in table ==>", rec) + print("Selected records ==>", result) + print("Total selected records in table ==>", len(result)) + nrows = self.expectedrows - 1 + self.assertEqual(table.byteorder, + {"little": "big", "big": "little"}[sys.byteorder]) + rec = list(table.iterrows())[-1] + self.assertEqual((rec['var1'], rec['var3']), (b"0001", nrows)) + self.assertEqual(len(result), 20) + + def test04_delete(self): + """Checking whether a single row can be deleted.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_delete..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read the records and select the ones with "var2" column less than 20 + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + nrows = table.nrows + table.nrowsinbuf = 3 # small value of the buffer + # Delete the twenty-th row + table.remove_rows(19, 20) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result2[-1]) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, nrows - 1) + self.assertEqual(table.shape, (nrows - 1,)) + # Check that the new list is smaller than the original one + self.assertEqual(len(result), len(result2) + 1) + self.assertEqual(result[:-1], result2) + + def test04a_delete(self): + """Checking whether a single row can be deleted.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_delete..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read the records and select the ones with "var2" column less than 20 + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + nrows = table.nrows + table.nrowsinbuf = 3 # small value of the buffer + # Delete the twenty-th row + table.remove_row(19) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result2[-1]) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, nrows - 1) + self.assertEqual(table.shape, (nrows - 1,)) + # Check that the new list is smaller than the original one + self.assertEqual(len(result), len(result2) + 1) + self.assertEqual(result[:-1], result2) + + def test04b_delete(self): + """Checking whether a range of rows can be deleted.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04b_delete..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read the records and select the ones with "var2" column less than 20 + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + nrows = table.nrows + table.nrowsinbuf = 4 # small value of the buffer + # Delete the last ten rows + table.remove_rows(10, 20) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result2[-1]) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, nrows - 10) + self.assertEqual(table.shape, (nrows - 10,)) + # Check that the new list is smaller than the original one + self.assertEqual(len(result), len(result2) + 10) + self.assertEqual(result[:10], result2) + + def test04c_delete(self): + """Checking whether removing a bad range of rows is detected.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04c_delete..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read the records and select the ones with "var2" column less than 20 + result = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + nrows = table.nrows + table.nrowsinbuf = 5 # small value of the buffer + # Delete a too large range of rows + table.remove_rows(10, nrows + 100) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows() if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result2[-1]) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, 10) + self.assertEqual(table.shape, (10,)) + # Check that the new list is smaller than the original one + self.assertEqual(len(result), len(result2) + 10) + self.assertEqual(result[:10], result2) + + def test04d_delete(self): + """Checking whether removing rows several times at once is working.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04d_delete..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read the records and select the ones with "var2" column less than 20 + result = [r['var2'] for r in table if r['var2'] < 20] + + nrows = table.nrows + nrowsinbuf = table.nrowsinbuf + table.nrowsinbuf = 6 # small value of the buffer + # Delete some rows + table.remove_rows(10, 15) + # It's necessary to restore the value of buffer to use the row object + # afterwards... + table.nrowsinbuf = nrowsinbuf + + # Append some rows + row = table.row + for i in range(10, 15): + row['var1'] = '%04d' % (self.appendrows - i) + # This line gives problems on Windows. Why? 
+ # row['var7'] = row['var1'][-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var8'], np.ndarray): + row['var8'] = [0, 1] + else: + row['var8'] = 1 + if isinstance(row['var9'], np.ndarray): + row['var9'] = [0.+float(i)*1j, float(i)+0.j] + else: + row['var9'] = float(i)+0.j + if isinstance(row['var10'], np.ndarray): + row['var10'] = [float(i)+0.j, 1.+float(i)*1j] + else: + row['var10'] = 1.+float(i)*1j + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + if hasattr(tb, 'Float16Col'): + if isinstance(row['var11'], np.ndarray): + row['var11'] = np.array((float(i),)*4) + else: + row['var11'] = float(i) + if hasattr(tb, 'Float96Col'): + if isinstance(row['var12'], np.ndarray): + row['var12'] = np.array((float(i),)*4) + else: + row['var12'] = float(i) + if hasattr(tb, 'Float128Col'): + if isinstance(row['var13'], np.ndarray): + row['var13'] = np.array((float(i),)*4) + else: + row['var13'] = float(i) + if hasattr(tb, 'Complex192Col'): + if isinstance(row['var14'], np.ndarray): + row['var14'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var14'] = 1 + float(i)*1j + if hasattr(tb, 'Complex256Col'): + if isinstance(row['var15'], np.ndarray): + row['var15'] = [float(i)+0j, 1 + float(i)*1j] + else: + row['var15'] = 1 + float(i)*1j + + row.append() + # Flush the buffer for this table + table.flush() + + # Delete 5 rows more + table.remove_rows(5, 10) + + # Re-read the records + result2 = [r['var2'] for r in table if r['var2'] < 20] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result2[-1]) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, nrows - 5) + self.assertEqual(table.shape, (nrows - 5,)) + # Check that the new list is smaller than the original one + self.assertEqual(len(result), len(result2) + 5) + # The last values has to be equal + self.assertEqual(result[10:15], result2[10:15]) + + def test04e_delete(self): + """Checking whether all rows can be deleted.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04e_delete..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read all records + result = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + table.nrowsinbuf = 4 # small value of the buffer + # Delete all rows + table.remove_rows(0, self.expectedrows) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, 0) + self.assertEqual(table.shape, (0,)) + self.assertEqual(len(result2), 0) + + def test04f_delete(self): + """Checking whether all rows can be deleted.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04e_delete..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read all records + result = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + table.nrowsinbuf = 4 # small value of the buffer + # Delete 100 rows + table.remove_rows() + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Total selected records in table ==>", len(result2)) + + self.assertEqual(table.nrows, 0) + self.assertEqual(table.shape, (0,)) + self.assertEqual(len(result2), 0) + + def test04g_delete(self): + """Checking whether rows can be deleted with a step parameter.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04e_delete..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + table = self.h5file.get_node("/table0") + + # Read all records + result = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last selected value ==>", result[-1]) + print("Total selected records in table ==>", len(result)) + + nrows = table.nrows + table.nrowsinbuf = 4 # small value of the buffer + # Delete 100 rows + table.remove_rows(0, nrows+1, 5) + + # Re-read the records + result2 = [r['var2'] for r in table.iterrows()] + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Total selected records in table ==>", len(result2)) + + outnrows = nrows - nrows // 5 + self.assertEqual(table.nrows, outnrows) + self.assertEqual(table.shape, (outnrows,)) + self.assertEqual(len(result2), outnrows) + + def test05_filtersTable(self): + """Checking tablefilters.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_filtersTable..." % + self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Check filters: + if self.compress != table.filters.complevel and common.verbose: + print("Error in compress. Class:", self.__class__.__name__) + print("self, table:", self.compress, table.filters.complevel) + self.assertEqual(table.filters.complevel, self.compress) + if self.compress > 0 and tb.which_lib_version(self.complib): + self.assertEqual(table.filters.complib, self.complib) + if self.shuffle != table.filters.shuffle and common.verbose: + print("Error in shuffle. Class:", self.__class__.__name__) + print("self, table:", self.shuffle, table.filters.shuffle) + self.assertEqual(self.shuffle, table.filters.shuffle) + if self.bitshuffle != table.filters.bitshuffle and common.verbose: + print("Error in bitshuffle. Class:", self.__class__.__name__) + print("self, table:", self.bitshuffle, table.filters.bitshuffle) + self.assertEqual(self.bitshuffle, table.filters.bitshuffle) + if self.fletcher32 != table.filters.fletcher32 and common.verbose: + print("Error in fletcher32. 
Class:", self.__class__.__name__) + print("self, table:", self.fletcher32, table.filters.fletcher32) + self.assertEqual(self.fletcher32, table.filters.fletcher32) + + def test06_attributes(self): + self.h5file = tb.open_file(self.h5fname) + obj = self.h5file.get_node('/table0') + + self.assertEqual(obj.flavor, 'numpy') + self.assertEqual(obj.shape, (self.expectedrows,)) + self.assertEqual(obj.ndim, 1) + self.assertEqual(obj.nrows, self.expectedrows) + + def test07_out_of_order_members(self): + # If members are stored 'out of order' make sure they are loaded + # correctly + self.h5file = tb.open_file( + common.test_filename("out_of_order_types.h5")) + row = self.h5file.get_node('/group/table')[0] + + self.assertEqual(row[0], b'*'*14) + self.assertEqual(row[1], b'-'*9) + self.assertEqual(row[2], b'.'*4) + + +class BasicWriteTestCase(BasicTestCase): + title = "BasicWrite" + + +class OldRecordBasicWriteTestCase(BasicTestCase): + title = "OldRecordBasicWrite" + record = OldRecord + + +class DictWriteTestCase(BasicTestCase): + # This checks also unidimensional arrays as columns + title = "DictWrite" + record = RecordDescriptionDict + nrows = 21 + nrowsinbuf = 3 # Choose a small value for the buffer size + start = 0 + stop = 10 + step = 3 + + +# Pure NumPy dtype +class NumPyDTWriteTestCase(BasicTestCase): + title = "NumPyDTWriteTestCase" + formats = "a4,i4,i2,2f8,f4,i2,a1,b1,c8,c16".split(',') + names = 'var1,var2,var3,var4,var5,var6,var7,var8,var9,var10'.split(',') + + if hasattr(tb, 'Float16Col'): + formats.append('f2') + names.append('var11') + if hasattr(tb, 'Float96Col'): + formats.append('f12') + names.append('var12') + if hasattr(tb, 'Float128Col'): + formats.append('f16') + names.append('var13') + if hasattr(tb, 'Complex192Col'): + formats.append('c24') + names.append('var14') + if hasattr(tb, 'Complex256Col'): + formats.append('c32') + names.append('var15') + + record = np.dtype(','.join(formats)) + record.names = names + + +class RecArrayOneWriteTestCase(BasicTestCase): + title = "RecArrayOneWrite" + formats = "a4,i4,i2,2f8,f4,i2,a1,b1,c8,c16".split(',') + names = 'var1,var2,var3,var4,var5,var6,var7,var8,var9,var10'.split(',') + + if hasattr(tb, 'Float16Col'): + formats.append('f2') + names.append('var11') + if hasattr(tb, 'Float96Col'): + formats.append('f12') + names.append('var12') + if hasattr(tb, 'Float128Col'): + formats.append('f16') + names.append('var13') + if hasattr(tb, 'Complex192Col'): + formats.append('c24') + names.append('var14') + if hasattr(tb, 'Complex256Col'): + formats.append('c32') + names.append('var15') + + record = np.rec.array( + None, shape=0, formats=','.join(formats), names=names) + + +class RecArrayTwoWriteTestCase(BasicTestCase): + title = "RecArrayTwoWrite" + expectedrows = 100 + recarrayinit = 1 + formats = "a4,i4,i2,2f8,f4,i2,a1,b1,c8,c16".split(',') + names = 'var1,var2,var3,var4,var5,var6,var7,var8,var9,var10'.split(',') + + if hasattr(tb, 'Float16Col'): + formats.append('f2') + names.append('var11') + if hasattr(tb, 'Float96Col'): + formats.append('f12') + names.append('var12') + if hasattr(tb, 'Float128Col'): + formats.append('f16') + names.append('var13') + if hasattr(tb, 'Complex192Col'): + formats.append('c24') + names.append('var14') + if hasattr(tb, 'Complex256Col'): + formats.append('c32') + names.append('var15') + + recordtemplate = np.rec.array( + None, shape=1, formats=','.join(formats), names=names) + + +class RecArrayThreeWriteTestCase(BasicTestCase): + title = "RecArrayThreeWrite" + expectedrows = 100 + recarrayinit = 1 + formats = 
"a4,i4,i2,2f8,f4,i2,a1,b1,c8,c16".split(',') + names = 'var1,var2,var3,var4,var5,var6,var7,var8,var9,var10'.split(',') + + if hasattr(tb, 'Float16Col'): + formats.append('f2') + names.append('var11') + if hasattr(tb, 'Float96Col'): + formats.append('f12') + names.append('var12') + if hasattr(tb, 'Float128Col'): + formats.append('f16') + names.append('var13') + if hasattr(tb, 'Complex192Col'): + formats.append('c24') + names.append('var14') + if hasattr(tb, 'Complex256Col'): + formats.append('c32') + names.append('var15') + + recordtemplate = np.rec.array( + None, shape=1, formats=','.join(formats), names=names) + + +class RecArrayAlignedWriteTestCase(BasicTestCase): + title = "RecArrayThreeWrite" + expectedrows = 100 + recarrayinit = 1 + formats = "a4,i4,i2,2f8,f4,i2,a1,b1,c8,c16".split(',') + names = 'var1,var2,var3,var4,var5,var6,var7,var8,var9,var10'.split(',') + + if hasattr(tb, 'Float16Col'): + formats.append('f2') + names.append('var11') + if hasattr(tb, 'Float96Col'): + formats.append('f12') + names.append('var12') + if hasattr(tb, 'Float128Col'): + formats.append('f16') + names.append('var13') + if hasattr(tb, 'Complex192Col'): + formats.append('c24') + names.append('var14') + if hasattr(tb, 'Complex256Col'): + formats.append('c32') + names.append('var15') + + recordtemplate = np.rec.array( + None, shape=1, formats=','.join(formats), names=names, aligned=True) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class CompressBloscTablesTestCase(BasicTestCase): + title = "CompressBloscTables" + compress = 6 + complib = "blosc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class CompressBloscShuffleTablesTestCase(BasicTestCase): + title = "CompressBloscTables" + compress = 1 + shuffle = 1 + complib = "blosc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + common.blosc_version < common.min_blosc_bitshuffle_version, + f'BLOSC >= {common.min_blosc_bitshuffle_version} required') +class CompressBloscBitShuffleTablesTestCase(BasicTestCase): + title = "CompressBloscBitShuffleTables" + compress = 1 + shuffle = 0 + bitshuffle = 1 + complib = "blosc:blosclz" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class CompressBloscBloscLZTablesTestCase(BasicTestCase): + title = "CompressBloscLZTables" + compress = 1 + shuffle = 1 + complib = "blosc:blosclz" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class CompressBloscLZ4TablesTestCase(BasicTestCase): + title = "CompressLZ4Tables" + compress = 1 + shuffle = 1 + complib = "blosc:lz4" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class CompressBloscLZ4HCTablesTestCase(BasicTestCase): + title = "CompressLZ4HCTables" + compress = 1 + shuffle = 1 + complib = "blosc:lz4hc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf('snappy' not in tb.blosc_compressor_list(), + 'snappy required') +class CompressBloscSnappyTablesTestCase(BasicTestCase): + title = "CompressSnappyTables" + compress = 1 + shuffle = 1 + complib = "blosc:snappy" + + +@common.unittest.skipIf(not common.blosc_avail, + 
                        'BLOSC compression library not available')
+@common.unittest.skipIf(
+    'zlib' not in tb.blosc_compressor_list(), 'zlib required')
+class CompressBloscZlibTablesTestCase(BasicTestCase):
+    title = "CompressZlibTables"
+    compress = 1
+    shuffle = 1
+    complib = "blosc:zlib"
+
+
+@common.unittest.skipIf(not common.blosc_avail,
+                        'BLOSC compression library not available')
+@common.unittest.skipIf(
+    'zstd' not in tb.blosc_compressor_list(), 'zstd required')
+class CompressBloscZstdTablesTestCase(BasicTestCase):
+    title = "CompressZstdTables"
+    compress = 1
+    shuffle = 1
+    complib = "blosc:zstd"
+
+
+@common.unittest.skipIf(not common.lzo_avail,
+                        'LZO compression library not available')
+class CompressLZOTablesTestCase(BasicTestCase):
+    title = "CompressLZOTables"
+    compress = 1
+    complib = "lzo"
+
+
+@common.unittest.skipIf(not common.lzo_avail,
+                        'LZO compression library not available')
+class CompressLZOShuffleTablesTestCase(BasicTestCase):
+    title = "CompressLZOTables"
+    compress = 1
+    shuffle = 1
+    complib = "lzo"
+
+
+@common.unittest.skipIf(not common.bzip2_avail,
+                        'BZIP2 compression library not available')
+class CompressBzip2TablesTestCase(BasicTestCase):
+    title = "CompressBzip2Tables"
+    compress = 1
+    complib = "bzip2"
+
+
+@common.unittest.skipIf(not common.bzip2_avail,
+                        'BZIP2 compression library not available')
+class CompressBzip2ShuffleTablesTestCase(BasicTestCase):
+    title = "CompressBzip2Tables"
+    compress = 1
+    shuffle = 1
+    complib = "bzip2"
+
+
+class CompressZLIBTablesTestCase(BasicTestCase):
+    title = "CompressOneTables"
+    compress = 1
+    complib = "zlib"
+
+
+class CompressZLIBShuffleTablesTestCase(BasicTestCase):
+    title = "CompressOneTables"
+    compress = 1
+    shuffle = 1
+    complib = "zlib"
+
+
+class Fletcher32TablesTestCase(BasicTestCase):
+    title = "Fletcher32Tables"
+    fletcher32 = 1
+    shuffle = 0
+    complib = "zlib"
+
+
+class AllFiltersTablesTestCase(BasicTestCase):
+    title = "AllFiltersTables"
+    compress = 1
+    fletcher32 = 1
+    shuffle = 1
+    complib = "zlib"
+
+
+class CompressTwoTablesTestCase(BasicTestCase):
+    title = "CompressTwoTables"
+    compress = 1
+    # This checks also unidimensional arrays as columns
+    record = RecordDescriptionDict
+
+
+class BigTablesTestCase(BasicTestCase):
+    title = "BigTables"
+    # 10000 rows takes much more time than we can afford for tests
+    # reducing to 1000 would be more than enough
+    # F.
Alted 2004-01-19 + # Will be executed only in common.heavy mode + expectedrows = 10_000 + appendrows = 100 + + +class SizeOnDiskInMemoryPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + def setUp(self): + super().setUp() + + # set chunkshape so it divides evenly into array_size, to avoid + # partially filled chunks + self.chunkshape = (1000, ) + self.dtype = np.format_parser(['i4'] * 10, [], []).dtype + # approximate size (in bytes) of non-data portion of hdf5 file + self.hdf_overhead = 6000 + + def create_table(self, complevel): + filters = tb.Filters(complevel=complevel, complib='blosc') + self.table = self.h5file.create_table('/', 'sometable', self.dtype, + filters=filters, + chunkshape=self.chunkshape) + + def test_zero_length(self): + complevel = 0 + self.create_table(complevel) + self.assertEqual(self.table.size_on_disk, 0) + self.assertEqual(self.table.size_in_memory, 0) + + # add 10 chunks of data in one append + def test_no_compression_one_append(self): + complevel = 0 + self.create_table(complevel) + self.table.append([tuple(range(10))] * self.chunkshape[0] * 10) + self.assertEqual(self.table.size_on_disk, 10 * 1000 * 10 * 4) + self.assertEqual(self.table.size_in_memory, 10 * 1000 * 10 * 4) + + # add 10 chunks of data in two appends + def test_no_compression_multiple_appends(self): + complevel = 0 + self.create_table(complevel) + self.table.append([tuple(range(10))] * self.chunkshape[0] * 5) + self.table.append([tuple(range(10))] * self.chunkshape[0] * 5) + self.assertEqual(self.table.size_on_disk, 10 * 1000 * 10 * 4) + self.assertEqual(self.table.size_in_memory, 10 * 1000 * 10 * 4) + + def test_with_compression(self): + complevel = 1 + self.create_table(complevel) + self.table.append([tuple(range(10))] * self.chunkshape[0] * 10) + file_size = Path(self.h5fname).stat().st_size + self.assertTrue( + abs(self.table.size_on_disk - file_size) <= self.hdf_overhead) + self.assertEqual(self.table.size_in_memory, 10 * 1000 * 10 * 4) + self.assertLess(self.table.size_on_disk, self.table.size_in_memory) + + +class NonNestedTableReadTestCase(common.TempFileMixin, + common.PyTablesTestCase): + def setUp(self): + super().setUp() + + self.dtype = np.format_parser(['i4'] * 10, [], []).dtype + self.table = self.h5file.create_table('/', 'table', self.dtype) + self.shape = (100, ) + self.populate_file() + + def populate_file(self): + self.array = np.zeros(self.shape, self.dtype) + for row_num, row in enumerate(self.array): + start = row_num * len(self.array.dtype.names) + for value, col in enumerate(self.array.dtype.names, start): + row[col] = value + self.table.append(self.array) + self.assertEqual(len(self.table), len(self.array)) + + def test_read_all(self): + output = self.table.read() + np.testing.assert_array_equal(output, self.array) + + def test_read_slice1(self): + output = self.table.read(0, 51) + np.testing.assert_array_equal(output, self.array[0:51]) + + def test_read_all_rows_specified_field(self): + output = self.table.read(field='f1') + np.testing.assert_array_equal(output, self.array['f1']) + + def test_read_slice1_specified_field(self): + output = self.table.read(1, 64, field='f1') + np.testing.assert_array_equal(output, self.array['f1'][1:64]) + + def test_out_arg_with_non_numpy_flavor(self): + output = np.empty(self.shape, self.dtype) + self.table.flavor = 'python' + self.assertRaises(TypeError, lambda: self.table.read(out=output)) + try: + self.table.read(out=output) + except TypeError as exc: + self.assertIn("Optional 'out' argument may only be", str(exc)) + 
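# --------------------------------------------------------------------
# Illustrative sketch (not part of the test suite) of the ``out=`` read
# pattern exercised by the tests above.  The file name
# "demo_read_out.h5" and the /demo table are made-up examples.  The
# ``out`` buffer must be C-contiguous, match the selected dtype and the
# number of selected rows, and use the system byteorder; otherwise
# Table.read() raises ValueError/TypeError, as the tests check.
import numpy as np
import tables as tb

demo_dtype = np.dtype([('f0', 'i4'), ('f1', 'i4')])
with tb.open_file("demo_read_out.h5", "w") as h5f:
    demo = h5f.create_table('/', 'demo', demo_dtype)
    data = np.zeros((10,), dtype=demo_dtype)
    data['f0'] = np.arange(10)
    data['f1'] = np.arange(10) * 2
    demo.append(data)

    # Whole table into a preallocated structured array.
    out_all = np.empty((10,), dtype=demo_dtype)
    demo.read(out=out_all)

    # A single column, sliced with start/stop/step, into an 'i4' buffer
    # sized to the number of selected rows (here 5).
    out_col = np.empty((5,), dtype='i4')
    demo.read(0, 10, 2, field='f0', out=out_col)
# --------------------------------------------------------------------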
+ def test_read_all_out_arg(self): + output = np.empty(self.shape, self.dtype) + self.table.read(out=output) + np.testing.assert_array_equal(output, self.array) + + def test_read_slice1_out_arg(self): + output = np.empty((51, ), self.dtype) + self.table.read(0, 51, out=output) + np.testing.assert_array_equal(output, self.array[0:51]) + + def test_read_all_rows_specified_field_out_arg(self): + output = np.empty(self.shape, 'i4') + self.table.read(field='f1', out=output) + np.testing.assert_array_equal(output, self.array['f1']) + + def test_read_slice1_specified_field_out_arg(self): + output = np.empty((63, ), 'i4') + self.table.read(1, 64, field='f1', out=output) + np.testing.assert_array_equal(output, self.array['f1'][1:64]) + + def test_read_all_out_arg_sliced(self): + output = np.empty((200, ), self.dtype) + output['f0'] = np.random.randint(0, 10_000, (200, )) + output_orig = output.copy() + self.table.read(out=output[0:100]) + np.testing.assert_array_equal(output[0:100], self.array) + np.testing.assert_array_equal(output[100:], output_orig[100:]) + + def test_all_fields_non_contiguous_slice_contiguous_buffer(self): + output = np.empty((50, ), self.dtype) + self.table.read(0, 100, 2, out=output) + np.testing.assert_array_equal(output, self.array[0:100:2]) + + def test_specified_field_non_contiguous_slice_contiguous_buffer(self): + output = np.empty((50, ), 'i4') + self.table.read(0, 100, 2, field='f3', out=output) + np.testing.assert_array_equal(output, self.array['f3'][0:100:2]) + + def test_all_fields_non_contiguous_buffer(self): + output = np.empty((100, ), self.dtype) + output_slice = output[0:100:2] + + with self.assertRaisesRegex(ValueError, + 'output array not C contiguous'): + self.table.read(0, 100, 2, field=None, out=output_slice) + + def test_specified_field_non_contiguous_buffer(self): + output = np.empty((100, ), 'i4') + output_slice = output[0:100:2] + self.assertRaises(ValueError, self.table.read, 0, 100, 2, 'f3', + output_slice) + try: + self.table.read(0, 100, 2, field='f3', out=output_slice) + except ValueError as exc: + self.assertEqual('output array not C contiguous', str(exc)) + + def test_all_fields_buffer_too_small(self): + output = np.empty((99, ), self.dtype) + self.assertRaises(ValueError, lambda: self.table.read(out=output)) + try: + self.table.read(out=output) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + def test_specified_field_buffer_too_small(self): + output = np.empty((99, ), 'i4') + self.assertRaises(ValueError, + lambda: self.table.read(field='f5', out=output)) + try: + self.table.read(field='f5', out=output) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + def test_all_fields_buffer_too_large(self): + output = np.empty((101, ), self.dtype) + self.assertRaises(ValueError, lambda: self.table.read(out=output)) + try: + self.table.read(out=output) + except ValueError as exc: + self.assertIn('output array size invalid, got', str(exc)) + + +class TableReadByteorderTestCase(common.TempFileMixin, + common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.system_byteorder = sys.byteorder + self.other_byteorder = { + 'little': 'big', 'big': 'little'}[sys.byteorder] + self.reverse_byteorders = {'little': '<', 'big': '>'} + + def create_table(self, byteorder): + table_dtype_code = self.reverse_byteorders[byteorder] + 'i4' + table_dtype = np.format_parser([table_dtype_code, 'a1'], [], []).dtype + self.table = self.h5file.create_table('/', 'table', table_dtype, + 
byteorder=byteorder) + input_dtype = np.format_parser(['i4', 'a1'], [], []).dtype + self.input_array = np.zeros((10, ), input_dtype) + self.input_array['f0'] = np.arange(10) + self.input_array['f1'] = b'a' + self.table.append(self.input_array) + + def test_table_system_byteorder_no_out_argument(self): + self.create_table(self.system_byteorder) + output = self.table.read() + self.assertEqual(tb.utils.byteorders[output['f0'].dtype.byteorder], + self.system_byteorder) + np.testing.assert_array_equal(output['f0'], np.arange(10)) + + def test_table_other_byteorder_no_out_argument(self): + self.create_table(self.other_byteorder) + output = self.table.read() + self.assertEqual(tb.utils.byteorders[output['f0'].dtype.byteorder], + self.system_byteorder) + np.testing.assert_array_equal(output['f0'], np.arange(10)) + + def test_table_system_byteorder_out_argument_system_byteorder(self): + self.create_table(self.system_byteorder) + out_dtype_code = self.reverse_byteorders[self.system_byteorder] + 'i4' + out_dtype = np.format_parser([out_dtype_code, 'a1'], [], []).dtype + output = np.empty((10, ), out_dtype) + self.table.read(out=output) + self.assertEqual(tb.utils.byteorders[output['f0'].dtype.byteorder], + self.system_byteorder) + np.testing.assert_array_equal(output['f0'], np.arange(10)) + + def test_table_other_byteorder_out_argument_system_byteorder(self): + self.create_table(self.other_byteorder) + out_dtype_code = self.reverse_byteorders[self.system_byteorder] + 'i4' + out_dtype = np.format_parser([out_dtype_code, 'a1'], [], []).dtype + output = np.empty((10, ), out_dtype) + self.table.read(out=output) + self.assertEqual(tb.utils.byteorders[output['f0'].dtype.byteorder], + self.system_byteorder) + np.testing.assert_array_equal(output['f0'], np.arange(10)) + + def test_table_system_byteorder_out_argument_other_byteorder(self): + self.create_table(self.system_byteorder) + out_dtype_code = self.reverse_byteorders[self.other_byteorder] + 'i4' + out_dtype = np.format_parser([out_dtype_code, 'a1'], [], []).dtype + output = np.empty((10, ), out_dtype) + self.assertRaises(ValueError, lambda: self.table.read(out=output)) + try: + self.table.read(out=output) + except ValueError as exc: + self.assertIn("array must be in system's byteorder", str(exc)) + + def test_table_other_byteorder_out_argument_other_byteorder(self): + self.create_table(self.other_byteorder) + out_dtype_code = self.reverse_byteorders[self.other_byteorder] + 'i4' + out_dtype = np.format_parser([out_dtype_code, 'a1'], [], []).dtype + output = np.empty((10, ), out_dtype) + self.assertRaises(ValueError, lambda: self.table.read(out=output)) + try: + self.table.read(out=output) + except ValueError as exc: + self.assertIn("array must be in system's byteorder", str(exc)) + + +class BasicRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + # file = "test.h5" + open_mode = "w" + title = "This is the table title" + record = Record + maxshort = 1 << 15 + expectedrows = 100 + compress = 0 + shuffle = 1 + # Default values + nrows = 20 + nrowsinbuf = 3 # Choose a small value for the buffer size + start = 1 + stop = nrows + checkrecarray = 0 + checkgetCol = 0 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + self.h5file.close() + + def populateFile(self): + group = self.rootgroup + for j in range(3): + # Create a table + filterprops = tb.Filters(complevel=self.compress, + shuffle=self.shuffle) + table = self.h5file.create_table(group, 'table'+str(j), + 
self.record, + title=self.title, + filters=filterprops, + expectedrows=self.expectedrows) + + # Get the row object associated with the new table + row = table.row + + # Fill the table + for i in range(self.expectedrows): + row['var1'] = '%04d' % (self.expectedrows - i) + row['var7'] = row['var1'][-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + + # var6 will be like var3 but byteswaped + row['var6'] = ( + ((row['var3'] >> 8) & 0xff) + ((row['var3'] << 8) & 0xff00) + ) + row.append() + + # Flush the buffer for this table + table.flush() + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) + group = group2 + + def check_range(self): + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + table.nrowsinbuf = self.nrowsinbuf + resrange = slice(self.start, self.stop, self.step).indices(table.nrows) + reslength = len(list(range(*resrange))) + # print "self.checkrecarray = ", self.checkrecarray + # print "self.checkgetCol = ", self.checkgetCol + if self.checkrecarray: + recarray = table.read(self.start, self.stop, self.step) + result = [] + for nrec in range(len(recarray)): + if recarray['var2'][nrec] < self.nrows and 0 < self.step: + result.append(recarray['var2'][nrec]) + elif recarray['var2'][nrec] > self.nrows and 0 > self.step: + result.append(recarray['var2'][nrec]) + elif self.checkgetCol: + column = table.read(self.start, self.stop, self.step, 'var2') + result = [] + for nrec in range(len(column)): + if column[nrec] < self.nrows and 0 < self.step: + result.append(column[nrec]) + elif column[nrec] > self.nrows and 0 > self.step: + result.append(column[nrec]) + else: + if 0 < self.step: + result = [ + rec['var2'] for rec in table.iterrows(self.start, + self.stop, + self.step) + if rec['var2'] < self.nrows + ] + elif 0 > self.step: + result = [ + rec['var2'] for rec in table.iterrows(self.start, + self.stop, + self.step) + if rec['var2'] > self.nrows + ] + + if self.start < 0: + startr = self.expectedrows + self.start + else: + startr = self.start + + if self.stop is None: + if self.checkrecarray or self.checkgetCol: + # data read using the read method + stopr = startr + 1 + else: + # data read using the iterrows method + stopr = self.nrows + elif self.stop < 0: + stopr = self.expectedrows + self.stop + else: + stopr = self.stop + + if self.nrows < stopr: + stopr = self.nrows + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + if reslength: + if self.checkrecarray: + print("Last record *read* in recarray ==>", recarray[-1]) + elif self.checkgetCol: + print("Last value *read* in getCol ==>", column[-1]) + else: + print("Last record *read* in table range ==>", rec) + print("Total number of selected records ==>", len(result)) + print("Selected records:\n", result) + print("Selected records should look like:\n", + list(range(startr, stopr, self.step))) + print("start, stop, step ==>", self.start, self.stop, self.step) + print("startr, stopr, step ==>", startr, stopr, self.step) + + self.assertEqual(result, list(range(startr, stopr, self.step))) + if not (self.checkrecarray or self.checkgetCol): + if startr < stopr and 0 < self.step: + rec = [r for r in 
table.iterrows(self.start, self.stop, + self.step) + if r['var2'] < self.nrows][-1] + if self.nrows < self.expectedrows: + self.assertEqual( + rec['var2'], + list(range(self.start, self.stop, self.step))[-1]) + else: + self.assertEqual( + rec['var2'], + list(range(startr, stopr, self.step))[-1]) + elif startr > stopr and 0 > self.step: + rec = [r['var2'] for r in table.iterrows(self.start, self.stop, + self.step) + if r['var2'] > self.nrows][0] + if self.nrows < self.expectedrows: + self.assertEqual( + rec, + list(range(self.start, self.stop or -1, self.step))[0]) + else: + self.assertEqual( + rec, + list(range(startr, stopr or -1, self.step))[0]) + + # Close the file + self.h5file.close() + + def test01_range(self): + """Checking ranges in table iterators (case1)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 2 * step + self.nrows = 21 + self.nrowsinbuf = 3 + self.start = 0 + self.stop = self.expectedrows + self.step = 2 + + self.check_range() + + def test01a_range(self): + """Checking ranges in table iterators (case1)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 2 * step + self.nrows = 21 + self.nrowsinbuf = 3 + self.start = self.expectedrows - 1 + self.stop = None + self.step = -2 + + self.check_range() + + def test02_range(self): + """Checking ranges in table iterators (case2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 10 * step + self.nrows = 21 + self.nrowsinbuf = 31 + self.start = 11 + self.stop = self.expectedrows + self.step = 3 + + self.check_range() + + def test03_range(self): + """Checking ranges in table iterators (case3)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 1.1 * step + self.nrows = self.expectedrows + self.nrowsinbuf = 11 # Choose a small value for the buffer size + self.start = 0 + self.stop = self.expectedrows + self.step = 10 + + self.check_range() + + def test04_range(self): + """Checking ranges in table iterators (case4)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_range..." % self.__class__.__name__) + + # Case where step == nrowsinbuf + self.nrows = self.expectedrows + self.nrowsinbuf = 11 # Choose a small value for the buffer size + self.start = 1 + self.stop = self.expectedrows + self.step = 11 + + self.check_range() + + def test05_range(self): + """Checking ranges in table iterators (case5)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_range..." % self.__class__.__name__) + + # Case where step > 1.1 * nrowsinbuf + self.nrows = 21 + self.nrowsinbuf = 10 # Choose a small value for the buffer size + self.start = 1 + self.stop = self.expectedrows + self.step = 11 + + self.check_range() + + def test06_range(self): + """Checking ranges in table iterators (case6)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06_range..." 
% self.__class__.__name__) + + # Case where step > 3 * nrowsinbuf + self.nrows = 3 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = 2 + self.stop = self.expectedrows + self.step = 10 + + self.check_range() + + def test07_range(self): + """Checking ranges in table iterators (case7)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_range..." % self.__class__.__name__) + + # Case where start == stop + self.nrows = 2 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = self.nrows + self.stop = self.nrows + self.step = 10 + + self.check_range() + + def test08_range(self): + """Checking ranges in table iterators (case8)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_range..." % self.__class__.__name__) + + # Case where start > stop + self.nrows = 2 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = self.nrows + 1 + self.stop = self.nrows + self.step = 1 + + self.check_range() + + def test09_range(self): + """Checking ranges in table iterators (case9)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09_range..." % self.__class__.__name__) + + # Case where stop = None (last row) + self.nrows = 100 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = 1 + self.stop = 2 + self.step = 1 + + self.check_range() + + def test10_range(self): + """Checking ranges in table iterators (case10)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10_range..." % self.__class__.__name__) + + # Case where start < 0 and stop = None (last row) + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -6 + self.startr = self.expectedrows + self.start + self.stop = -5 + self.stopr = self.expectedrows + self.step = 2 + + self.check_range() + + def test10a_range(self): + """Checking ranges in table iterators (case10a)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10a_range..." % self.__class__.__name__) + + # Case where start < 0 and stop = 0 + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -6 + self.startr = self.expectedrows + self.start + self.stop = 0 + self.stopr = self.expectedrows + self.step = 2 + + self.check_range() + + def test11_range(self): + """Checking ranges in table iterators (case11)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test11_range..." % self.__class__.__name__) + + # Case where start < 0 and stop < 0 + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -6 + self.startr = self.expectedrows + self.start + self.stop = -2 + self.stopr = self.expectedrows + self.stop + self.step = 1 + + self.check_range() + + def test12_range(self): + """Checking ranges in table iterators (case12)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test12_range..." 
% self.__class__.__name__) + + # Case where start < 0 and stop < 0 and start > stop + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -1 + self.startr = self.expectedrows + self.start + self.stop = -2 + self.stopr = self.expectedrows + self.stop + self.step = 1 + + self.check_range() + + def test13_range(self): + """Checking ranges in table iterators (case13)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13_range..." % self.__class__.__name__) + + # Case where step < 0 + self.step = -11 + try: + self.check_range() + except ValueError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next ValueError was catched!") + print(value) + self.h5file.close() + # else: + # print rec + # self.fail("expected a ValueError") + + # Case where step == 0 + self.step = 0 + try: + self.check_range() + except ValueError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next ValueError was catched!") + print(value) + self.h5file.close() + # else: + # print rec + # self.fail("expected a ValueError") + + +class IterRangeTestCase(BasicRangeTestCase): + pass + + +class RecArrayRangeTestCase(BasicRangeTestCase): + checkrecarray = 1 + + +class GetColRangeTestCase(BasicRangeTestCase): + checkgetCol = 1 + + def test01_nonexistentField(self): + """Checking non-existing Field in getCol method """ + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_nonexistentField..." % + self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + self.root = self.h5file.root + table = self.h5file.get_node("/table0") + + with self.assertRaises(KeyError): + # column = table.read(field='non-existent-column') + table.col('non-existent-column') + + +class GetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + title = "This is the table title" + record = Record + maxshort = 1 << 15 + expectedrows = 100 + compress = 0 + shuffle = 1 + # Default values + nrows = 20 + nrowsinbuf = 3 # Choose a small value for the buffer size + start = 1 + stop = nrows + checkrecarray = 0 + checkgetCol = 0 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + self.h5file.close() + + def populateFile(self): + group = self.rootgroup + for j in range(3): + # Create a table + filterprops = tb.Filters(complevel=self.compress, + shuffle=self.shuffle) + table = self.h5file.create_table(group, 'table'+str(j), + self.record, + title=self.title, + filters=filterprops, + expectedrows=self.expectedrows) + # Get the row object associated with the new table + row = table.row + + # Fill the table + for i in range(self.expectedrows): + row['var1'] = '%04d' % (self.expectedrows - i) + row['var7'] = row['var1'][-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + # var6 will be like var3 but byteswaped + row['var6'] = ((row['var3'] >> 8) & 0xff) + \ + ((row['var3'] << 8) & 0xff00) + row.append() + + # Flush the buffer for this table + table.flush() + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) 
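# Aside (illustrative, separate from populateFile above): the var6
# assignment a few lines up is a plain 16-bit byteswap of var3.
# For example:
import numpy as np

v = 0x1234
swapped = ((v >> 8) & 0xff) + ((v << 8) & 0xff00)
assert swapped == 0x3412
assert swapped == int(np.uint16(v).byteswap())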
+ group = group2 + + def test01a_singleItem(self): + """Checking __getitem__ method with single parameter (int)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[2] + self.assertEqual(result["var2"], 2) + result = table[25] + self.assertEqual(result["var2"], 25) + result = table[self.expectedrows-1] + self.assertEqual(result["var2"], self.expectedrows - 1) + + def test01b_singleItem(self): + """Checking __getitem__ method with single parameter (neg. int)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[-5] + self.assertEqual(result["var2"], self.expectedrows - 5) + result = table[-1] + self.assertEqual(result["var2"], self.expectedrows - 1) + result = table[-self.expectedrows] + self.assertEqual(result["var2"], 0) + + def test01c_singleItem(self): + """Checking __getitem__ method with single parameter (long)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[2] + self.assertEqual(result["var2"], 2) + result = table[25] + self.assertEqual(result["var2"], 25) + result = table[self.expectedrows-1] + self.assertEqual(result["var2"], self.expectedrows - 1) + + def test01d_singleItem(self): + """Checking __getitem__ method with single parameter (neg. long)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01d_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[-5] + self.assertEqual(result["var2"], self.expectedrows - 5) + result = table[-1] + self.assertEqual(result["var2"], self.expectedrows - 1) + result = table[-self.expectedrows] + self.assertEqual(result["var2"], 0) + + def test01e_singleItem(self): + """Checking __getitem__ method with single parameter (rank-0 ints)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01e_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[np.array(2)] + self.assertEqual(result["var2"], 2) + result = table[np.array(25)] + self.assertEqual(result["var2"], 25) + result = table[np.array(self.expectedrows-1)] + self.assertEqual(result["var2"], self.expectedrows - 1) + + def test01f_singleItem(self): + """Checking __getitem__ method with single parameter (np.uint64)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01f_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + + result = table[np.uint64(2)] + self.assertEqual(result["var2"], 2) + + def test02_twoItems(self): + """Checking __getitem__ method with start, stop parameters.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_twoItem..." 
% self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[2:6] + self.assertEqual(result["var2"].tolist(), list(range(2, 6))) + result = table[2:-6] + self.assertEqual(result["var2"].tolist(), list(range( + 2, self.expectedrows-6))) + result = table[2:] + self.assertEqual(result["var2"].tolist(), + list(range(2, self.expectedrows))) + result = table[-2:] + self.assertEqual(result["var2"].tolist(), + list(range(self.expectedrows-2, self.expectedrows))) + + def test03_threeItems(self): + """Checking __getitem__ method with start, stop, step parameters.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_threeItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + result = table[2:6:3] + self.assertEqual(result["var2"].tolist(), list(range(2, 6, 3))) + result = table[2::3] + self.assertEqual(result["var2"].tolist(), list(range( + 2, self.expectedrows, 3))) + result = table[:6:2] + self.assertEqual(result["var2"].tolist(), list(range(0, 6, 2))) + result = table[::] + self.assertEqual(result["var2"].tolist(), list(range( + 0, self.expectedrows, 1))) + + def test04_negativeStep(self): + """Checking __getitem__ method with negative step parameter.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_negativeStep..." % + self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + with self.assertRaises(ValueError): + table[2:3:-3] + + def test06a_singleItemCol(self): + """Checking __getitem__ method in Col with single parameter.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06a_singleItemCol..." % + self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + colvar2 = table.cols.var2 + self.assertEqual(colvar2[2], 2) + self.assertEqual(colvar2[25], 25) + self.assertEqual(colvar2[self.expectedrows-1], self.expectedrows - 1) + + def test06b_singleItemCol(self): + """Checking __getitem__ method in Col with single parameter + (negative)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06b_singleItem..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + colvar2 = table.cols.var2 + self.assertEqual(colvar2[-5], self.expectedrows - 5) + self.assertEqual(colvar2[-1], self.expectedrows - 1) + self.assertEqual(colvar2[-self.expectedrows], 0) + + def test07_twoItemsCol(self): + """Checking __getitem__ method in Col with start, stop parameters.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_twoItemCol..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + colvar2 = table.cols.var2 + self.assertEqual(colvar2[2:6].tolist(), list(range(2, 6))) + self.assertEqual(colvar2[2:-6].tolist(), + list(range(2, self.expectedrows - 6))) + self.assertEqual(colvar2[2:].tolist(), + list(range(2, self.expectedrows))) + self.assertEqual(colvar2[-2:].tolist(), + list(range(self.expectedrows - 2, self.expectedrows))) + + def test08_threeItemsCol(self): + """Checking __getitem__ method in Col with start, stop, step + parameters.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_threeItemCol..." 
% + self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + colvar2 = table.cols.var2 + self.assertEqual(colvar2[2:6:3].tolist(), list(range(2, 6, 3))) + self.assertEqual(colvar2[2::3].tolist(), list(range( + 2, self.expectedrows, 3))) + self.assertEqual(colvar2[:6:2].tolist(), list(range(0, 6, 2))) + self.assertEqual(colvar2[::].tolist(), + list(range(0, self.expectedrows, 1))) + + def test09_negativeStep(self): + """Checking __getitem__ method in Col with negative step parameter.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09_negativeStep..." % + self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + colvar2 = table.cols.var2 + with self.assertRaises(ValueError): + colvar2[2:3:-3] + + def test10_list_integers(self): + """Checking accessing Table with a list of integers.""" + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + idx = list(range(10, 70, 11)) + + result = table[idx] + self.assertEqual(result["var2"].tolist(), idx) + + result = table.read_coordinates(idx) + self.assertEqual(result["var2"].tolist(), idx) + + def test11_list_booleans(self): + """Checking accessing Table with a list of boolean values.""" + + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.root.table0 + idx = list(range(10, 70, 11)) + + selection = [n in idx for n in range(self.expectedrows)] + + result = table[selection] + self.assertEqual(result["var2"].tolist(), idx) + + result = table.read_coordinates(selection) + self.assertEqual(result["var2"].tolist(), idx) + + +class Rec(tb.IsDescription): + col1 = tb.IntCol(pos=1) + col2 = tb.StringCol(itemsize=3, pos=2) + col3 = tb.FloatCol(pos=3) + + +class SetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test01(self): + """Checking modifying one table row with __setitem__""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing row + table[2] = (456, 'db2', 1.2) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (456, b'db2', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test01b(self): + """Checking modifying one table row with __setitem__ (long index)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing row + table[2] = (456, 'db2', 1.2) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (456, b'db2', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read 
the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test02(self): + """Modifying one row, with a step (__setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array([(457, b'db1', 1.2)], formats="i4,a3,f8") + table[1:3:2] = rows + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test03(self): + """Checking modifying several rows at once (__setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array([(457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8") + # table.modify_rows(start=1, rows=rows) + table[1:3] = rows + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (5, b'de1', 1.3), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test04(self): + """Modifying several rows at once, with a step (__setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array([(457, b'db1', 1.2), (6, b'de2', 1.3)], + formats="i4,a3,f8") + # table[1:4:2] = rows + table[1::2] = rows + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (457, b'db1', 1.2), (6, b'de2', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should 
look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test05(self): + """Checking modifying one column (single element, __setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + table.cols.col1[1] = -1 + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (-1, b'ded', 1.3), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06a(self): + """Checking modifying one column (several elements, __setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + table.cols.col1[1:4] = [2, 3, 4] + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06b(self): + """Checking modifying one column (iterator, __setitem__)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + with self.assertRaises(NotImplementedError): + for row in table.iterrows(): + row['col1'] = row.nrow + 1 + row.append() + table.flush() + +# # Create the modified recarray +# r1=np.rec.array([[1,b'dbe',1.2],[2,b'ded',1.3], +# [3,b'db1',1.2],[4,b'de1',1.3]], +# formats="i4,a3,f8", +# names = "col1,col2,col3") +# # Read the modified table +# if self.reopen: +# self.fileh.close() +# self.fileh = tables.open_file(self.file, "r") +# table = self.fileh.root.recarray +# table.nrowsinbuf = self.buffersize # set buffer value +# r2 = table.read() +# if common.verbose: +# print "Original table-->", repr(r2) +# print "Should look like-->", repr(r1) +# self.assertEqual(r1.tobytes(), r2.tobytes()) +# self.assertEqual(table.nrows, 4) + + def test07(self): + """Modifying one column (several elements, __setitem__, step)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = 
self.buffersize  # set buffer value
+
+        # append new rows
+        r = np.rec.array(
+            [(456, b'dbe', 1.2), (1, b'ded', 1.3)], formats="i4,a3,f8")
+        table.append(r)
+        table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)])
+        # Modify just one existing column
+        table.cols.col1[1:4:2] = [2, 3]
+        # Create the modified recarray
+        r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3),
+                           (457, b'db1', 1.2), (3, b'de1', 1.3)],
+                          formats="i4,a3,f8",
+                          names="col1,col2,col3")
+        # Read the modified table
+        if self.reopen:
+            self._reopen()
+            table = self.h5file.root.recarray
+            table.nrowsinbuf = self.buffersize  # set buffer value
+        r2 = table.read()
+        if common.verbose:
+            print("Original table-->", repr(r2))
+            print("Should look like-->", repr(r1))
+        self.assertEqual(r1.tobytes(), r2.tobytes())
+        self.assertEqual(table.nrows, 4)
+
+    def test08(self):
+        """Modifying one column (one element, __setitem__, step)"""
+
+        # Create a new table:
+        table = self.h5file.create_table(self.h5file.root, 'recarray', Rec)
+        table.nrowsinbuf = self.buffersize  # set buffer value
+
+        # append new rows
+        r = np.rec.array(
+            [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8")
+        table.append(r)
+        table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)])
+
+        # Modify just one existing column
+        table.cols.col1[1:4:3] = [2]
+        # Create the modified recarray
+        r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3),
+                           (457, b'db1', 1.2), (5, b'de1', 1.3)],
+                          formats="i4,a3,f8",
+                          names="col1,col2,col3")
+        # Read the modified table
+        if self.reopen:
+            self._reopen()
+            table = self.h5file.root.recarray
+            table.nrowsinbuf = self.buffersize  # set buffer value
+        r2 = table.read()
+        if common.verbose:
+            print("Original table-->", repr(r2))
+            print("Should look like-->", repr(r1))
+        self.assertEqual(r1.tobytes(), r2.tobytes())
+        self.assertEqual(table.nrows, 4)
+
+    def test09(self):
+        """Modifying beyond the table extent (__setitem__, step)"""
+
+        # Create a new table:
+        table = self.h5file.create_table(self.h5file.root, 'recarray', Rec)
+        table.nrowsinbuf = self.buffersize  # set buffer value
+
+        # append new rows
+        r = np.rec.array(
+            [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8")
+        table.append(r)
+        table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)])
+
+        # Try to modify beyond the table extent
+        # This will silently exclude the non-fitting rows
+        rows = np.rec.array(
+            [(457, b'db1', 1.2), (6, b'de2', 1.3)], formats="i4,a3,f8")
+        table[1::2] = rows
+        # How the result should look
+        r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2),
+                           (457, b'db1', 1.2), (6, b'de2', 1.3)],
+                          formats="i4,a3,f8")
+
+        # Read the modified table
+        if self.reopen:
+            self._reopen()
+            table = self.h5file.root.recarray
+            table.nrowsinbuf = self.buffersize  # set buffer value
+        r2 = table.read()
+        if common.verbose:
+            print("Original table-->", repr(r2))
+            print("Should look like-->", repr(r1))
+        self.assertEqual(r1.tobytes(), r2.tobytes())
+        self.assertEqual(table.nrows, 4)
+
+
+class SetItemTestCase1(SetItemTestCase):
+    reopen = 0
+    buffersize = 1
+
+
+class SetItemTestCase2(SetItemTestCase):
+    reopen = 1
+    buffersize = 2
+
+
+class SetItemTestCase3(SetItemTestCase):
+    reopen = 0
+    buffersize = 1000
+
+
+class SetItemTestCase4(SetItemTestCase):
+    reopen = 1
+    buffersize = 1000
+
+
+class UpdateRowTestCase(common.TempFileMixin, common.PyTablesTestCase):
+    def test01(self):
+        """Checking modifying one table row with Row.update"""
+
+        # Create a new table:
+        table = self.h5file.create_table(self.h5file.root, 'recarray', Rec)
+        table.nrowsinbuf = 
self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing row + for row in table.iterrows(2, 3): + (row['col1'], row['col2'], row['col3']) = (456, 'db2', 1.2) + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (456, b'db2', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test02(self): + """Modifying one row, with a step (Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + for row in table.iterrows(1, 3, 2): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 3: + (row['col1'], row['col2'], row['col3']) = (6, 'de2', 1.3) + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test03(self): + """Checking modifying several rows at once (Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + for row in table.iterrows(1, 3): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 2: + (row['col1'], row['col2'], row['col3']) = (5, 'de1', 1.3) + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (5, b'de1', 1.3), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test04(self): + """Modifying several rows at once, with a step (Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = 
np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify two existing rows + for row in table.iterrows(1, stop=4, step=2): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 3: + (row['col1'], row['col2'], row['col3']) = (6, 'de2', 1.3) + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (457, b'db1', 1.2), (6, b'de2', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test05(self): + """Checking modifying one column (single element, Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + for row in table.iterrows(1, 2): + row['col1'] = -1 + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (-1, b'ded', 1.3), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06(self): + """Checking modifying one column (several elements, Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + for row in table.iterrows(1, 4): + row['col1'] = row.nrow + 1 + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test07(self): + """Modifying values from a selection""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (1, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify just rows with col1 < 456 + for row in table.where('col1 < 
456'): + row['col1'] = 2 + row['col2'] = 'ada' + row.update() + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ada', 1.3), + (457, b'db1', 1.2), (2, b'ada', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08(self): + """Modifying a large table (Row.update)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + nrows = 100 + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify all the rows + for row in table: + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats="i4,a3,f8", names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test08b(self): + """Setting values on a large table without calling Row.update""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + nrows = 100 + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify all the rows (actually don't) + for row in table: + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + # row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats="i4,a3,f8", names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test09(self): + """Modifying selected values on a large table""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + nrows = 100 + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify selected rows + for row in table.where('col1 > nrows-3'): + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats="i4,a3,f8", 
names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + # modify just the last line + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test09b(self): + """Modifying selected values on a large table (alternate values)""" + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + table.nrowsinbuf = self.buffersize # set buffer value + + nrows = 100 + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify selected rows + for row in table.iterrows(step=10): + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats="i4,a3,f8", names="col1,col2,col3") + for i in range(nrows): + if i % 10 > 0: + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + else: + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + +class UpdateRowTestCase1(UpdateRowTestCase): + reopen = 0 + buffersize = 1 + + +class UpdateRowTestCase2(UpdateRowTestCase): + reopen = 1 + buffersize = 2 + + +class UpdateRowTestCase3(UpdateRowTestCase): + reopen = 0 + buffersize = 1000 + + +class UpdateRowTestCase4(UpdateRowTestCase): + reopen = 1 + buffersize = 1000 + + +class RecArrayIO(common.TempFileMixin, common.PyTablesTestCase): + def test00(self): + """Checking saving a regular recarray""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00..." % self.__class__.__name__) + + # Create a recarray + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'de', 1.3)], names='col1,col2,col3') + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r) + + # Read it again + if self.reopen: + self._reopen() + r2 = self.h5file.root.recarray.read() + self.assertEqual(r.tobytes(), r2.tobytes()) + + def test01(self): + """Checking saving a recarray with an offset in its buffer""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01..." % self.__class__.__name__) + + # Create a recarray + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'de', 1.3)], names='col1,col2,col3') + + # Get an offsetted bytearray + r1 = r[1:] + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r1) + + # Read it again + if self.reopen: + self._reopen() + r2 = self.h5file.root.recarray.read() + + self.assertEqual(r1.tobytes(), r2.tobytes()) + + def test02(self): + """Checking saving a large recarray with an offset in its buffer""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array(b'a'*200_000, 'f4,3i4,a5,i2', 3000) + + # Get an offsetted bytearray + r1 = r[2000:] + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r1) + + # Read it again + if self.reopen: + self._reopen() + r2 = self.h5file.root.recarray.read() + + self.assertEqual(r1.tobytes(), r2.tobytes()) + + def test03(self): + """Checking saving a strided recarray with an offset in its buffer""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03..." % self.__class__.__name__) + + # Create a recarray + r = np.rec.array(b'a'*200_000, 'f4,3i4,a5,i2', 3000) + + # Get an strided recarray + r2 = r[::2] + + # Get an offsetted bytearray + r1 = r2[1200:] + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r1) + + # Read it again + if self.reopen: + self._reopen() + r2 = self.h5file.root.recarray.read() + + self.assertEqual(r1.tobytes(), r2.tobytes()) + + def test04(self): + """Checking appending several rows at once""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04..." % self.__class__.__name__) + + class Rec(tb.IsDescription): + col1 = tb.IntCol(pos=1) + col2 = tb.StringCol(itemsize=3, pos=2) + col3 = tb.FloatCol(pos=3) + + # Save it in a table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Create the complete table + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the original table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = self.h5file.root.recarray.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test05(self): + """Checking appending several rows at once (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05..." % self.__class__.__name__) + + # Save it in a table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + self._reopen() + + table = self.h5file.root.recarray + # Create the complete table + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + + # Read the original table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = self.h5file.root.recarray.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06a(self): + """Checking modifying one table row (list version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06a..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify just one existing rows + table.modify_rows(start=1, rows=[(456, 'db1', 1.2)]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (456, b'db1', 1.2), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06b(self): + """Checking modifying one table row (recarray version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06b..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify just one existing rows + table.modify_rows( + start=2, + rows=np.rec.array([(456, 'db2', 1.2)], formats="i4,a3,f8")) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (456, b'db2', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test07a(self): + """Checking modifying several rows at once (list version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07a..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify two existing rows + table.modify_rows(start=1, rows=[(457, 'db1', 1.2), (5, 'de1', 1.3)]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (5, b'de1', 1.3), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test07b(self): + """Checking modifying several rows at once (recarray version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07b..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify two existing rows + rows = np.rec.array([(457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8") + table.modify_rows(start=1, rows=rows) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (457, b'db1', 1.2), + (5, b'de1', 1.3), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test07c(self): + """Checking modifying several rows with a mismatching value""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07c..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify two existing rows + rows = np.rec.array( + [(457, b'db1', 1.2), (5, b'de1', 1.3)], formats="i4,a3,f8") + self.assertRaises(ValueError, table.modify_rows, + start=1, stop=2, rows=rows) + + def test08a(self): + """Checking modifying one column (single column version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08a..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + table.modify_columns(start=1, columns=[[2, 3, 4]], names=["col1"]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08a2(self): + """Checking modifying one column (single column version, + modify_column)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08a2..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + table.modify_column(start=1, column=[2, 3, 4], colname="col1") + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08b(self): + """Checking modifying one column (single column version, recarray)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08b..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + columns = np.rec.fromarrays(np.array([[2, 3, 4]]), formats="i4") + table.modify_columns(start=1, columns=columns, names=["col1"]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08b2(self): + """Checking modifying one column (single column version, recarray, + modify_column)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08b2..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + columns = np.rec.fromarrays(np.array([[2, 3, 4]]), formats="i4") + table.modify_column(start=1, column=columns, colname="col1") + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'ded', 1.3), + (3, b'db1', 1.2), (4, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08c(self): + """Checking modifying one column (single column version, + single element)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08c..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify just one existing column + # columns = np.rec.fromarrays(np.array([[4]]), formats="i4") + # table.modify_columns(start=1, columns=columns, names=["col1"]) + table.modify_columns(start=1, columns=[[4]], names=["col1"]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (4, b'ded', 1.3), + (457, b'db1', 1.2), (5, b'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test09a(self): + """Checking modifying table columns (multiple column version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09a..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify a couple of columns + columns = [["aaa", "bbb", "ccc"], [1.2, .1, .3]] + table.modify_columns(start=1, columns=columns, names=["col2", "col3"]) + # Create the modified recarray + r1 = np.rec.array([(456, b'dbe', 1.2), (2, b'aaa', 1.2), + (457, b'bbb', .1), (5, b'ccc', .3)], + formats="i4,a3,f8", + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test09b(self): + """Checking modifying table columns (multiple columns, recarray)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09b..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify a couple of columns + columns = np.rec.array([("aaa", 1.2), ("bbb", .1), ("ccc", .3)], + formats="a3,f8") + table.modify_columns(start=1, columns=columns, names=["col2", "col3"]) + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'aaa', 1.2), + (457, 'bbb', .1), (5, 'ccc', .3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test09c(self): + """Checking modifying table columns (single column, step)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09c..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + # Modify a couple of columns + columns = np.rec.array([("aaa", 1.2), ("bbb", .1)], formats="a3,f8") + table.modify_columns(start=1, step=2, columns=columns, + names=["col2", "col3"]) + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'aaa', 1.2), + (457, 'db1', 1.2), (5, 'bbb', .1)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test09d(self): + """Checking modifying table columns (multiple columns, step)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09d..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + # Modify a couple of columns + columns = np.rec.array([("aaa", 1.3), ("bbb", .1)], formats="a3,f8") + table.modify_columns(start=0, step=2, columns=columns, + names=["col2", "col3"]) + # Create the modified recarray + r1 = np.rec.array([(456, 'aaa', 1.3), (2, 'ded', 1.3), + (457, 'bbb', .1), (5, 'de1', 1.3)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test10a(self): + """Checking modifying rows using coordinates + (readCoords/modifyCoords).""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10a..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + columns = table.read_coordinates([0, 3]) + + # Modify both rows + columns['col1'][:] = [55, 56] + columns['col3'][:] = [1.9, 1.8] + + # Modify the table in the same coordinates + table.modify_coordinates([0, 3], columns) + + # Create the modified recarray + r1 = np.rec.array([(55, b'dbe', 1.9), (2, b'ded', 1.3), + (457, b'db1', 1.2), (56, b'de1', 1.8)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test10b(self): + """Checking modifying rows using coordinates (getitem/setitem).""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10b..." 
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # append new rows + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'ded', 1.3)], formats="i4,a3,f8") + table.append(r) + table.append([(457, b'db1', 1.2), (5, b'de1', 1.3)]) + + columns = table[[0, 3]] + + # Modify both rows + columns['col1'][:] = [55, 56] + columns['col3'][:] = [1.9, 1.8] + + # Modify the table in the same coordinates + table[[0, 3]] = columns + + # Create the modified recarray + r1 = np.rec.array([(55, b'dbe', 1.9), (2, b'ded', 1.3), + (457, b'db1', 1.2), (56, b'de1', 1.8)], + formats="i4,a3,f8", + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + +class RecArrayIO1(RecArrayIO): + reopen = 0 + + +class RecArrayIO2(RecArrayIO): + reopen = 1 + + +class CopyTestCase(common.TempFileMixin, common.PyTablesTestCase): + def assertEqualColinstances(self, table1, table2): + """Assert that column instance maps of both tables are equal.""" + + cinst1, cinst2 = table1.colinstances, table2.colinstances + self.assertEqual(len(cinst1), len(cinst2)) + for (cpathname, col1) in cinst1.items(): + self.assertTrue(cpathname in cinst2) + col2 = cinst2[cpathname] + self.assertIsInstance(col1, type(col2)) + if isinstance(col1, tb.Column): + self.assertEqual(col1.name, col2.name) + self.assertEqual(col1.pathname, col2.pathname) + self.assertEqual(col1.dtype, col2.dtype) + self.assertEqual(col1.type, col2.type) + elif isinstance(col1, tb.Cols): + self.assertEqual(col1._v_colnames, col2._v_colnames) + self.assertEqual(col1._v_colpathnames, col2._v_colpathnames) + + def test01_copy(self): + """Checking Table.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_copy..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array( + [(456, b'dbe', 1.2), (2, b'de', 1.3)], + names='col1,col2,col3', formats=('i4,S3,f8'), aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table + table2 = table1.copy('/', 'table2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + # print "dirs-->", dir(table1), dir(table2) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + # row1 is a Row instance, while table2[] is a + # RecArray.Record instance + # print "reprs-->", repr(row1), repr(table2.read(nrow)) + for colname in table1.colnames: + # Both ways to compare works well + # self.assertEqual(row1[colname], table2[nrow][colname)) + self.assertEqual(row1[colname], + table2.read(nrow, field=colname)[0]) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + self.assertEqual(table1.description._v_offsets, [0, 4, 8]) + self.assertEqual(table1.description._v_itemsize, 16) + else: + self.assertEqual(table1.description._v_offsets, [0, 4, 7]) + self.assertEqual(table1.description._v_itemsize, 15) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # This could be not the same when re-opening the file + # self.assertEqual(table1.description._v_ColObjects, + # table2.description._v_ColObjects) + # Leaf attributes + self.assertEqual(table1.title, table2.title) + self.assertEqual(table1.filters.complevel, table2.filters.complevel) + self.assertEqual(table1.filters.complib, table2.filters.complib) + self.assertEqual(table1.filters.shuffle, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + + def test02_copy(self): + """Checking Table.copy() method (where specified)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_copy..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array( + [(b'dbe', 456, 1.2), (b'de', 2, 1.3)], + names='col1,col2,col3', formats="S3,i4,f8", aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table in another group + group1 = self.h5file.create_group("/", "group1") + table2 = table1.copy(group1, 'table2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.group1.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + for colname in table1.colnames: + # Both ways to compare works well + # self.assertEqual(row1[colname], table2[nrow][colname)) + self.assertEqual(row1[colname], + table2.read(nrow, field=colname)[0]) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + self.assertEqual(table1.description._v_offsets, [0, 4, 8]) + self.assertEqual(table1.description._v_itemsize, 16) + else: + self.assertEqual(table1.description._v_offsets, [0, 3, 7]) + self.assertEqual(table1.description._v_itemsize, 15) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # Leaf attributes + self.assertEqual(table1.title, table2.title) + self.assertEqual(table1.filters.complevel, table2.filters.complevel) + self.assertEqual(table1.filters.complib, table2.filters.complib) + self.assertEqual(table1.filters.shuffle, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + + def test03_copy(self): + """Checking Table.copy() method (table larger than buffer)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_copy..." 
% self.__class__.__name__) + + # Create a recarray exceeding buffers capability + # This works, but takes too much CPU for a test + # It is better to reduce the buffer size (table1.nrowsinbuf) + # r=np.rec.array(b'aaaabbbbccccddddeeeeffffgggg'*20000, + # formats='2i2,i4, (2,3)u2, (1,)f4, f8',shape=700) + r = np.rec.array( + b'aaaabbbbccccddddeeeeffffgggg' * 200, + formats='2i2,i4, (2,3)u2, (1,)f4, f8', shape=7, + aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table in another group and other title + group1 = self.h5file.create_group("/", "group1") + table1.nrowsinbuf = 2 # small value of buffer + table2 = table1.copy(group1, 'table2', title="title table2") + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.group1.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + for colname in table1.colnames: + # self.assertTrue(allequal(row1[colname], + # table2[nrow][colname])) + self.assertTrue(common.allequal( + row1[colname], table2.read(nrow, field=colname)[0])) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + self.assertEqual(table1.description._v_offsets, [0, 4, 8, 20, 24]) + self.assertEqual(table1.description._v_itemsize, 32) + else: + self.assertEqual(table1.description._v_offsets, [0, 4, 8, 20, 24]) + self.assertEqual(table1.description._v_itemsize, 32) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # Leaf attributes + self.assertEqual("title table2", table2.title) + self.assertEqual(table1.filters.complevel, table2.filters.complevel) + self.assertEqual(table1.filters.complib, table2.filters.complib) + self.assertEqual(table1.filters.shuffle, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + + def test04_copy(self): + """Checking Table.copy() method (different compress level)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_copy..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array([(1.2, b'dbe', 456), (1.3, b'de', 2)], + names='col1,col2,col3', formats="f8,S3,i4", + aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table in another group + group1 = self.h5file.create_group("/", "group1") + table2 = table1.copy(group1, 'table2', + filters=tb.Filters(complevel=6)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.group1.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + for colname in table1.colnames: + # Both ways to compare works well + # self.assertEqual(row1[colname], table2[nrow][colname)) + self.assertEqual(row1[colname], + table2.read(nrow, field=colname)[0]) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + self.assertEqual(table1.description._v_offsets, [0, 8, 12]) + self.assertEqual(table1.description._v_itemsize, 16) + else: + self.assertEqual(table1.description._v_offsets, [0, 8, 11]) + self.assertEqual(table1.description._v_itemsize, 15) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # Leaf attributes + self.assertEqual(table1.title, table2.title) + self.assertEqual(6, table2.filters.complevel) + self.assertEqual(1, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + + def test05_copy(self): + """Checking Table.copy() method (user attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_copy..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array([(456, b'dbe', 1.2), (2, b'de', 1.3)], + names='col1,col2,col3', formats='i8,S3,f8', + aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + # Add some user attributes + table1.attrs.attr1 = "attr1" + table1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table in another group + group1 = self.h5file.create_group("/", "group1") + table2 = table1.copy(group1, 'table2', + copyuserattrs=1, + filters=tb.Filters(complevel=6)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.group1.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + for colname in table1.colnames: + # self.assertEqual(row1[colname], table2[nrow][colname)) + self.assertEqual(row1[colname], + table2.read(nrow, field=colname)[0]) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + # The conditions for guessing the correct alignment are very + # tricky, so better disable the checks. Feel free to re-enable + # them during debugging by removing the False condition below. + if False: + if is_os_64bit() and is_python_64bit(): + self.assertEqual(table1.description._v_offsets, [0, 8, 16]) + self.assertEqual(table1.description._v_itemsize, 24) + elif not is_os_64bit() and not is_python_64bit(): + self.assertEqual(table1.description._v_offsets, [0, 8, 12]) + self.assertEqual(table1.description._v_itemsize, 20) + else: + self.assertEqual(table1.description._v_offsets, [0, 8, 11]) + self.assertEqual(table1.description._v_itemsize, 19) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # Leaf attributes + self.assertEqual(table1.title, table2.title) + self.assertEqual(6, table2.filters.complevel) + self.assertEqual(1, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + # User attributes + self.assertEqual(table2.attrs.attr1, "attr1") + self.assertEqual(table2.attrs.attr2, 2) + + def test05b_copy(self): + """Checking Table.copy() method (user attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_copy..." 
% self.__class__.__name__) + + # Create a recarray + r = np.rec.array([(456, b'dbe', 1.2), (2, b'de', 1.3)], + names='col1,col2,col3', formats='i8,S3,f4', + aligned=self.aligned) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + # Add some user attributes + table1.attrs.attr1 = "attr1" + table1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table in another group + group1 = self.h5file.create_group("/", "group1") + table2 = table1.copy(group1, 'table2', + copyuserattrs=0, + filters=tb.Filters(complevel=6)) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.group1.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + for row1 in table1: + nrow = row1.nrow # current row + for colname in table1.colnames: + # self.assertEqual(row1[colname], table2[nrow][colname)) + self.assertEqual(row1[colname], + table2.read(nrow, field=colname)[0]) + + # Assert other properties in table + self.assertEqual(table1.nrows, table2.nrows) + self.assertEqual(table1.shape, table2.shape) + self.assertEqual(table1.colnames, table2.colnames) + self.assertEqual(table1.coldtypes, table2.coldtypes) + self.assertEqualColinstances(table1, table2) + self.assertEqual(repr(table1.description), repr(table2.description)) + # Check alignment + if self.aligned and self.open_kwargs['allow_padding'] is True: + self.assertEqual(table1.description._v_offsets, [0, 8, 12]) + self.assertEqual(table1.description._v_itemsize, 16) + else: + self.assertEqual(table1.description._v_offsets, [0, 8, 11]) + self.assertEqual(table1.description._v_itemsize, 15) + self.assertEqual(table1.description._v_offsets, + table2.description._v_offsets) + self.assertEqual(table1.description._v_itemsize, + table2.description._v_itemsize) + + # Leaf attributes + self.assertEqual(table1.title, table2.title) + self.assertEqual(6, table2.filters.complevel) + self.assertEqual(1, table2.filters.shuffle) + self.assertEqual(table1.filters.fletcher32, table2.filters.fletcher32) + # User attributes + self.assertEqual(hasattr(table2.attrs, "attr1"), 0) + self.assertEqual(hasattr(table2.attrs, "attr2"), 0) + + +class CloseCopyTestCase(CopyTestCase): + close = True + aligned = False + open_kwargs = {'allow_padding': False} + + +class OpenCopyTestCase(CopyTestCase): + close = False + aligned = False + open_kwargs = {'allow_padding': True} + + +class AlignedCloseCopyTestCase(CopyTestCase): + close = True + aligned = True + open_kwargs = {'allow_padding': False} + + +class AlignedOpenCopyTestCase(CopyTestCase): + close = False + aligned = True + open_kwargs = {'allow_padding': True} + + +class AlignedNoPaddingOpenCopyTestCase(CopyTestCase): + close = False + aligned = True + open_kwargs = {'allow_padding': False} + + +class CopyIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test01_index(self): + """Checking Table.copy() method with indexes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_index..." 
% self.__class__.__name__) + + # Create a recarray exceeding buffers capability + r = np.rec.array(b'aaaabbbbccccddddeeeeffffgggg' * 200, + formats='2i2, (1,)i4, (2,3)u2, (1,)f4, (1,)f8', + shape=10) + # The line below exposes a bug in numpy + # formats='2i2, i4, (2,3)u2, f4, f8',shape=10) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table + table1.nrowsinbuf = self.nrowsinbuf + table2 = table1.copy("/", 'table2', + start=self.start, + stop=self.stop, + step=self.step) + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + for nrow in range(r2.shape[0]): + for colname in table1.colnames: + self.assertTrue(common.allequal( + r2[nrow][colname], table2[nrow][colname])) + + # Assert the number of rows in table + if common.verbose: + print("nrows in table2-->", table2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], table2.nrows) + + def test02_indexclosef(self): + """Checking Table.copy() method with indexes (close file version)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_indexclosef..." % self.__class__.__name__) + + # Create a recarray exceeding buffers capability + r = np.rec.array(b'aaaabbbbccccddddeeeeffffgggg' * 200, + formats='2i2, i4, (2,3)u2, f4, f8', shape=10) + # Save it in a table: + table1 = self.h5file.create_table(self.h5file.root, 'table1', r, + "title table1") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + table1 = self.h5file.root.table1 + + # Copy to another table + table1.nrowsinbuf = self.nrowsinbuf + table2 = table1.copy("/", 'table2', + start=self.start, + stop=self.stop, + step=self.step) + + self._reopen() + table1 = self.h5file.root.table1 + table2 = self.h5file.root.table2 + + if common.verbose: + print("table1-->", table1.read()) + print("table2-->", table2.read()) + print("attrs table1-->", repr(table1.attrs)) + print("attrs table2-->", repr(table2.attrs)) + + # Check that all the elements are equal + r2 = r[self.start:self.stop:self.step] + for nrow in range(r2.shape[0]): + for colname in table1.colnames: + self.assertTrue(common.allequal( + r2[nrow][colname], table2[nrow][colname])) + + # Assert the number of rows in table + if common.verbose: + print("nrows in table2-->", table2.nrows) + print("and it should be-->", r2.shape[0]) + self.assertEqual(r2.shape[0], table2.nrows) + + +class CopyIndex1TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 1 + start = 0 + stop = 7 + step = 1 + + +class CopyIndex2TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 0 + start = 0 + stop = -1 + step = 1 + + +class CopyIndex3TestCase(CopyIndexTestCase): + nrowsinbuf = 3 + close = 1 + start = 1 + stop = 7 + step = 1 + + +class CopyIndex4TestCase(CopyIndexTestCase): + nrowsinbuf = 4 + close = 0 + start = 0 + stop = 6 + step = 1 + + +class CopyIndex5TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 1 + start = 3 + stop = 7 + step = 1 + + +class CopyIndex6TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 0 + start = 3 + stop = 6 + step = 2 + + +class CopyIndex7TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + 
close = 1 + start = 0 + stop = 7 + step = 10 + + +class CopyIndex8TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 0 + start = 6 + stop = 3 + step = 1 + + +class CopyIndex9TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 1 + start = 3 + stop = 4 + step = 1 + + +class CopyIndex10TestCase(CopyIndexTestCase): + nrowsinbuf = 1 + close = 0 + start = 3 + stop = 4 + step = 2 + + +class CopyIndex11TestCase(CopyIndexTestCase): + nrowsinbuf = 2 + close = 1 + start = -3 + stop = -1 + step = 2 + + +class CopyIndex12TestCase(CopyIndexTestCase): + nrowsinbuf = 3 + close = 0 + start = -1 # Should point to the last element + stop = None # None should mean the last element (including it) + step = 1 + + +class LargeRowSize(common.TempFileMixin, common.PyTablesTestCase): + def test00(self): + """Checking saving a Table with a moderately large rowsize""" + + # Create a recarray + r = np.rec.array([(np.arange(100)) * 2]) + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'largerow', r) + + # Read it again + r2 = self.h5file.root.largerow.read() + + self.assertEqual(r.tobytes(), r2.tobytes()) + + def test01(self): + """Checking saving a Table with an extremely large rowsize""" + + # Create a recarray (1.4 MB rowsize) + r = np.zeros(10, dtype=np.dtype('(300,100)i4,(400,400)f8')) + # From PyTables 1.3 on, we allow row sizes equal or larger than 640 KB + self.h5file.create_table(self.h5file.root, 'largerow', r) + + # Read it again + r2 = self.h5file.root.largerow.read() + self.assertEqual(r.tobytes(), r2.tobytes()) + + +class DefaultValues(common.TempFileMixin, common.PyTablesTestCase): + record = Record + + def test00(self): + """Checking saving a Table with default values (using the same Row)""" + + # Create a table + table = self.h5file.create_table(self.h5file.root, 'table', + self.record) + + table.nrowsinbuf = 46 # minimum amount that reproduces a problem + # Take a number of records a bit greater + nrows = int(table.nrowsinbuf * 1.1) + row = table.row + # Fill the table with nrows records + for i in range(nrows): + if i == 3: + row['var2'] = 2 + if i == 4: + row['var3'] = 3 + # This injects the row values. + row.append() + + # We need to flush the buffers in table in order to get an + # accurate number of records on it. + table.flush() + + # Create a recarray with the same default values + values = [b"abcd", 1, 2, 3.1, 4.2, 5, "e", 1, 1j, 1 + 0j] + formats = 'a4,i4,i2,f8,f4,u2,a1,b1,c8,c16'.split(',') + + if hasattr(tb, 'Float16Col'): + values.append(6.4) + formats.append('f2') + if hasattr(tb, 'Float96Col'): + values.append(6.4) + formats.append('f12') + if hasattr(tb, 'Float128Col'): + values.append(6.4) + formats.append('f16') + if hasattr(tb, 'Complex192Col'): + values.append(1.-0.j) + formats.append('c24') + if hasattr(tb, 'Complex256Col'): + values.append(1.-0.j) + formats.append('c32') + + r = np.rec.array([tuple(values)] * nrows, formats=','.join(formats)) + + # Assign the value exceptions + r["f1"][3] = 2 + r["f2"][4] = 3 + + # Read the table in another recarray + # r2 = table.read() + r2 = table[::] # Equivalent to table.read() + + # This generates too much output. 
Activate only when + # self.nrowsinbuf is very small (<10) + if common.verbose: + print("First 10 table values:") + for row in table.iterrows(0, 10): + print(row) + print("The first 5 read recarray values:") + print(r2[:5]) + print("Records should look like:") + print(r[:5]) + + for name1, name2 in zip(r.dtype.names, r2.dtype.names): + self.assertTrue(common.allequal(r[name1], r2[name2])) + + # The following can give false errors when columns with extended + # precision data type are present in the record. + # It is probably due to some difference in the value of bits used + # for patting (longdoubles use just 80 bits but are stored in 96 or + # 128 bits in numpy arrays) + # self.assertEqual(r.tobytes(), r2.tobytes()) + + def test01(self): + """Checking saving a Table with default values (using different Row)""" + + # Create a table + table = self.h5file.create_table(self.h5file.root, 'table', + self.record) + + table.nrowsinbuf = 46 # minimum amount that reproduces a problem + # Take a number of records a bit greater + nrows = int(table.nrowsinbuf * 1.1) + # Fill the table with nrows records + for i in range(nrows): + if i == 3: + table.row['var2'] = 2 + if i == 4: + table.row['var3'] = 3 + # This injects the row values. + table.row.append() + + # We need to flush the buffers in table in order to get an + # accurate number of records on it. + table.flush() + + # Create a recarray with the same default values + values = [b"abcd", 1, 2, 3.1, 4.2, 5, "e", 1, 1j, 1 + 0j] + formats = 'a4,i4,i2,f8,f4,u2,a1,b1,c8,c16'.split(',') + + if hasattr(tb, 'Float16Col'): + values.append(6.4) + formats.append('f2') + if hasattr(tb, 'Float96Col'): + values.append(6.4) + formats.append('f12') + if hasattr(tb, 'Float128Col'): + values.append(6.4) + formats.append('f16') + if hasattr(tb, 'Complex192Col'): + values.append(1.-0.j) + formats.append('c24') + if hasattr(tb, 'Complex256Col'): + values.append(1.-0.j) + formats.append('c32') + + r = np.rec.array([tuple(values)] * nrows, formats=','.join(formats)) + + # Assign the value exceptions + r["f1"][3] = 2 + r["f2"][4] = 3 + + # Read the table in another recarray + # r2 = table.read() + r2 = table[::] # Equivalent to table.read() + + # This generates too much output. Activate only when + # self.nrowsinbuf is very small (<10) + if common.verbose: + print("First 10 table values:") + for row in table.iterrows(0, 10): + print(row) + print("The first 5 read recarray values:") + print(r2[:5]) + print("Records should look like:") + print(r[:5]) + + for name1, name2 in zip(r.dtype.names, r2.dtype.names): + self.assertTrue(common.allequal(r[name1], r2[name2])) + + # The following can give false errors when columns with extended + # precision data type are present in the record. 
+ # It is probably due to some difference in the value of bits used + # for patting (longdoubles use just 80 bits but are stored in 96 or + # 128 bits in numpy arrays) + # self.assertEqual(r.tobytes(), r2.tobytes()) + + +class OldRecordDefaultValues(DefaultValues): + title = "OldRecordDefaultValues" + record = OldRecord + + +class Record2(tb.IsDescription): + var1 = tb.StringCol(itemsize=4, dflt=b"abcd") # 4-character String + var2 = tb.IntCol(dflt=1) # integer + var3 = tb.Int16Col(dflt=2) # short integer + var4 = tb.Float64Col(dflt=3.1) # double (double-precision) + + +class LengthTestCase(common.TempFileMixin, common.PyTablesTestCase): + record = Record + nrows = 20 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + + def populateFile(self): + # Create a table + table = self.h5file.create_table(self.h5file.root, 'table', + self.record, title="__length__ test") + # Get the row object associated with the new table + row = table.row + + # Fill the table + for i in range(self.nrows): + row.append() + + # Flush the buffer for this table + table.flush() + self.table = table + + def test01_lengthrows(self): + """Checking __length__ in Table.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_lengthrows..." % self.__class__.__name__) + + # Number of rows + len(self.table) == self.nrows + + def test02_lengthcols(self): + """Checking __length__ in Cols.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_lengthcols..." % self.__class__.__name__) + + # Number of columns + if self.record is Record: + len(self.table.cols) == 8 + elif self.record is Record2: + len(self.table.cols) == 4 + + def test03_lengthcol(self): + """Checking __length__ in Column.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_lengthcol..." % self.__class__.__name__) + + # Number of rows for all columns column + for colname in self.table.colnames: + len(getattr(self.table.cols, colname)) == self.nrows + + +class Length1TestCase(LengthTestCase): + record = Record + nrows = 20 + + +class Length2TestCase(LengthTestCase): + record = Record2 + nrows = 100 + + +class WhereAppendTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests `Table.append_where()` method.""" + + class SrcTblDesc(tb.IsDescription): + id = tb.IntCol() + v1 = tb.FloatCol() + v2 = tb.StringCol(itemsize=8) + + def setUp(self): + super().setUp() + + tbl = self.h5file.create_table('/', 'test', self.SrcTblDesc) + row = tbl.row + + row['id'] = 1 + row['v1'] = 1.5 + row['v2'] = 'a' * 8 + row.append() + + row['id'] = 2 + row['v1'] = 2.5 + row['v2'] = 'b' * 6 + row.append() + + tbl.flush() + + def test00_same(self): + """Query with same storage.""" + + DstTblDesc = self.SrcTblDesc + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + tbl1.append_where(tbl2, 'id > 1') + + # Rows resulting from the query are those in the new table. + it2 = iter(tbl2) + for r1 in tbl1.where('id > 1'): + r2 = next(it2) + self.assertTrue(r1['id'] == r2['id'] and r1['v1'] == r2['v1'] + and r1['v2'] == r2['v2']) + + # There are no more rows. 
+ self.assertRaises(StopIteration, next, it2) + + def test01_compatible(self): + """Query with compatible storage.""" + + class DstTblDesc(tb.IsDescription): + id = tb.FloatCol() # float, not int + v1 = tb.FloatCol() + v2 = tb.StringCol(itemsize=16) # a longer column + v3 = tb.FloatCol() # extra column + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + tbl1.append_where(tbl2, 'id > 1') + + # Rows resulting from the query are those in the new table. + it2 = iter(tbl2) + for r1 in tbl1.where('id > 1'): + r2 = next(it2) + self.assertTrue(r1['id'] == r2['id'] and r1['v1'] == r2['v1'] + and r1['v2'] == r2['v2']) + + # There are no more rows. + self.assertRaises(StopIteration, next, it2) + + def test02_lessPrecise(self): + """Query with less precise storage.""" + + class DstTblDesc(tb.IsDescription): + id = tb.IntCol() + v1 = tb.IntCol() # int, not float + v2 = tb.StringCol(itemsize=8) + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + tbl1.append_where(tbl2, 'id > 1') + + # Rows resulting from the query are those in the new table. + it2 = iter(tbl2) + for r1 in tbl1.where('id > 1'): + r2 = next(it2) + self.assertTrue(r1['id'] == r2['id'] and int(r1['v1']) == r2['v1'] + and r1['v2'] == r2['v2']) + + # There are no more rows. + self.assertRaises(StopIteration, next, it2) + + def test03_incompatible(self): + """Query with incompatible storage.""" + + class DstTblDesc(tb.IsDescription): + id = tb.StringCol(itemsize=4) # string, not int + v1 = tb.FloatCol() + v2 = tb.StringCol(itemsize=8) + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + self.assertRaises(NotImplementedError, + tbl1.append_where, tbl2, 'v1 == b"1"') + + def test04_noColumn(self): + """Query with storage lacking columns.""" + + class DstTblDesc(tb.IsDescription): + # no ``id`` field + v1 = tb.FloatCol() + v2 = tb.StringCol(itemsize=8) + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + self.assertRaises(KeyError, tbl1.append_where, tbl2, 'id > 1') + + def test05_otherFile(self): + """Appending to a table in another file.""" + + h5fname2 = tempfile.mktemp(suffix='.h5') + + try: + with tb.open_file(h5fname2, 'w') as h5file2: + tbl1 = self.h5file.root.test + tbl2 = h5file2.create_table('/', 'test', self.SrcTblDesc) + + # RW to RW. + tbl1.append_where(tbl2, 'id > 1') + + # RW to RO. + with tb.open_file(h5fname2, 'r') as h5file2: + tbl2 = h5file2.root.test + self.assertRaises(tb.FileModeError, + tbl1.append_where, tbl2, 'id > 1') + + # RO to RO. + self._reopen('r') + tbl1 = self.h5file.root.test + self.assertRaises(tb.FileModeError, + tbl1.append_where, tbl2, 'id > 1') + + # RO to RW. + with tb.open_file(h5fname2, 'a') as h5file2: + tbl2 = h5file2.root.test + tbl1.append_where(tbl2, 'id > 1') + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + def test06_wholeTable(self): + """Append whole table.""" + + DstTblDesc = self.SrcTblDesc + + tbl1 = self.h5file.root.test + tbl2 = self.h5file.create_table('/', 'test2', DstTblDesc) + + tbl1.append_where(tbl2) + + # Rows resulting from the query are those in the new table. + it2 = iter(tbl2) + for r1 in tbl1.__iter__(): + r2 = next(it2) + self.assertTrue(r1['id'] == r2['id'] and r1['v1'] == r2['v1'] + and r1['v2'] == r2['v2']) + + # There are no more rows. 
+ self.assertRaises(StopIteration, next, it2) + + +class DerivedTableTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5file.create_table('/', 'original', Record) + + def test00(self): + """Deriving a table from the description of another.""" + + tbl1 = self.h5file.root.original + tbl2 = self.h5file.create_table('/', 'derived', tbl1.description) + + self.assertEqual(tbl1.description, tbl2.description) + + +class ChunkshapeTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5file.create_table('/', 'table', Record, chunkshape=13) + + def test00(self): + """Test setting the chunkshape in a table (no reopen).""" + + tbl = self.h5file.root.table + if common.verbose: + print("chunkshape-->", tbl.chunkshape) + self.assertEqual(tbl.chunkshape, (13,)) + + def test01(self): + """Test setting the chunkshape in a table (reopen).""" + + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, 'r') + tbl = self.h5file.root.table + if common.verbose: + print("chunkshape-->", tbl.chunkshape) + self.assertEqual(tbl.chunkshape, (13,)) + + +# Test for appending zero-sized recarrays +class ZeroSizedTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + # Create a Table + t = self.h5file.create_table('/', 'table', {'c1': tb.Int32Col(), + 'c2': tb.Float64Col()}) + # Append a single row + t.append([(1, 2.2)]) + + def test01_canAppend(self): + """Appending zero length recarray.""" + + t = self.h5file.root.table + a = np.empty(shape=(0,), dtype='i4,f8') + t.append(a) + self.assertEqual(t.nrows, 1, "The number of rows should be 1.") + + +# Case for testing ticket #103, i.e. selections in columns which are +# aligned but that its data length is not an exact multiple of the +# length of the record. This exposes the problem only in 32-bit +# machines, because in 64-bit machine, 'c2' is unaligned. However, +# this should check most platforms where, while not unaligned, +# len(datatype) > boundary_alignment is fullfilled. 
+class IrregularStrideTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1) + c2 = tb.Float64Col(pos=2) + + table = self.h5file.create_table('/', 'table', IRecord) + for i in range(10): + table.row['c1'] = i + table.row['c2'] = i + table.row.append() + table.flush() + + def test00(self): + """Selecting rows in a table with irregular stride (but aligned).""" + + table = self.h5file.root.table + coords1 = table.get_where_list('c1<5') + coords2 = table.get_where_list('c2<5') + if common.verbose: + print("\nSelected coords1-->", coords1) + print("Selected coords2-->", coords2) + self.assertTrue( + common.allequal(coords1, np.arange(5, dtype=tb.utils.SizeType))) + self.assertTrue( + common.allequal(coords2, np.arange(5, dtype=tb.utils.SizeType))) + + +class Issue262TestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1) + c2 = tb.Float64Col(pos=2) + + table = self.h5file.create_table('/', 'table', IRecord) + table.nrowsinbuf = 3 + + for i in range(20): + table.row['c1'] = i + table.row['c2'] = i + table.row.append() + + table.row['c1'] = i % 29 + table.row['c2'] = 300 - i + table.row.append() + + table.row['c1'] = 300 - i + table.row['c2'] = 100 + i % 30 + table.row.append() + + table.flush() + + def test_gh260(self): + """Regression test for gh-260""" + + table = self.h5file.root.table + coords1 = table.get_where_list('(c1>5)&(c2<30)', start=0, step=2) + coords2 = table.get_where_list('(c1>5)&(c2<30)', start=1, step=2) + data = table.read() + data = data[np.where((data['c1'] > 5) & (data['c2'] < 30))] + + if common.verbose: + print() + print("Selected coords1-->", coords1) + print("Selected coords2-->", coords2) + print("Selected data-->", data) + self.assertEqual(len(coords1) + len(coords2), len(data)) + + def test_gh262_01(self): + """Regression test for gh-262 (start=0, step=1)""" + + table = self.h5file.root.table + data = table.get_where_list('(c1>5)&(~(c1>5))', start=0, step=1) + + if common.verbose: + print() + print("data -->", data) + self.assertEqual(len(data), 0) + + def test_gh262_02(self): + """Regression test for gh-262 (start=1, step=1)""" + + table = self.h5file.root.table + data = table.get_where_list('(c1>5)&(~(c1>5))', start=1, step=1) + + if common.verbose: + print() + print("data -->", data) + self.assertEqual(len(data), 0) + + def test_gh262_03(self): + """Regression test for gh-262 (start=0, step=2)""" + + table = self.h5file.root.table + data = table.get_where_list('(c1>5)&(~(c1>5))', start=0, step=2) + + if common.verbose: + print() + print("data -->", data) + self.assertEqual(len(data), 0) + + def test_gh262_04(self): + """Regression test for gh-262 (start=1, step=2)""" + + table = self.h5file.root.table + data = table.get_where_list('(c1>5)&(~(c1>5))', start=1, step=2) + + if common.verbose: + print() + print("data -->", data) + self.assertEqual(len(data), 0) + + +class TruncateTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + table = self.h5file.create_table('/', 'table', self.IRecord) + # Fill just a couple of rows + for i in range(2): + table.row['c1'] = i + table.row['c2'] = i + table.row.append() + table.flush() + # The defaults + self.dflts = table.coldflts + + def test00_truncate(self): + """Checking Table.truncate() method (truncating to 0 rows)""" + + table = self.h5file.root.table + # Truncate to 0 elements + 
table.truncate(0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("table-->", table.read()) + + self.assertEqual(table.nrows, 0) + for row in table: + self.assertEqual(row['c1'], row.nrow) + + def test01_truncate(self): + """Checking Table.truncate() method (truncating to 1 rows)""" + + table = self.h5file.root.table + # Truncate to 1 element + table.truncate(1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("table-->", table.read()) + + self.assertEqual(table.nrows, 1) + for row in table: + self.assertEqual(row['c1'], row.nrow) + + def test02_truncate(self): + """Checking Table.truncate() method (truncating to == self.nrows)""" + + table = self.h5file.root.table + # Truncate to 2 elements + table.truncate(2) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("table-->", table.read()) + + self.assertEqual(table.nrows, 2) + for row in table: + self.assertEqual(row['c1'], row.nrow) + + def test03_truncate(self): + """Checking Table.truncate() method (truncating to > self.nrows)""" + + table = self.h5file.root.table + # Truncate to 4 elements + table.truncate(4) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("table-->", table.read()) + + self.assertEqual(table.nrows, 4) + # Check the original values + for row in table.iterrows(start=0, stop=2): + self.assertEqual(row['c1'], row.nrow) + # Check that the added rows have the default values + for row in table.iterrows(start=2, stop=4): + self.assertEqual(row['c1'], self.dflts['c1']) + self.assertEqual(row['c2'], self.dflts['c2']) + + +class TruncateOpen1(TruncateTestCase): + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1) + c2 = tb.FloatCol(pos=2) + close = 0 + + +class TruncateOpen2(TruncateTestCase): + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1, dflt=3) + c2 = tb.FloatCol(pos=2, dflt=-3.1) + close = 0 + + +class TruncateClose1(TruncateTestCase): + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1) + c2 = tb.FloatCol(pos=2) + close = 1 + + +class TruncateClose2(TruncateTestCase): + class IRecord(tb.IsDescription): + c1 = tb.Int32Col(pos=1, dflt=4) + c2 = tb.FloatCol(pos=2, dflt=3.1) + close = 1 + + +class PointSelectionTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + N = 100 + + self.working_keyset = [ + [0, 1], + [0, -1], + ] + self.not_working_keyset = [ + [0, N], + [0, N+1], + [0, -N-1], + ] + + # Limits for selections + self.limits = [ + (0, 1), # just one element + (20, -10), # no elements + (-10, 4), # several elements + (0, 10), # several elements (again) + ] + + # Create a sample tables + self.data = data = np.arange(N) + self.recarr = recarr = np.empty(N, dtype="i4,f4") + recarr["f0"][:] = data + recarr["f1"][:] = data + self.table = self.h5file.create_table(self.h5file.root, 'table', + recarr) + + def test01a_read(self): + """Test for point-selections (read, boolean keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = (data >= value1) & (data < value2) + if common.verbose: + print("Selection to test:", key) + a = recarr[key] + b = table[key] + if common.verbose: + 
print("NumPy selection:", a) + print("PyTables selection:", b) + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01b_read(self): + """Test for point-selections (read, tuples of integers keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2)) + if common.verbose: + print("Selection to test:", key, type(key)) + a = recarr[key] + b = table[key] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01c_read(self): + """Test for point-selections (read, tuples of floats keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2)) + if common.verbose: + print("Selection to test:", key) + recarr[key] + fkey = np.array(key, "f4") + self.assertRaises(TypeError, table.__getitem__, fkey) + + def test01d_read(self): + """Test for point-selections (read, numpy keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2))[0] + if common.verbose: + print("Selection to test:", key, type(key)) + a = recarr[key] + b = table[key] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01e_read(self): + """Test for point-selections (read, list keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2))[0].tolist() + if common.verbose: + print("Selection to test:", key, type(key)) + a = recarr[key] + b = table[key] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01f_read(self): + recarr = self.recarr + table = self.table + + for key in self.working_keyset: + if common.verbose: + print("Selection to test:", key) + a = recarr[key] + b = table[key] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables selections does not match.") + + def test01g_read(self): + table = self.table + + for key in self.not_working_keyset: + if common.verbose: + print("Selection to test:", key) + + self.assertRaises(IndexError, table.__getitem__, key) + + def test02a_write(self): + """Test for point-selections (write, boolean keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2)) + if common.verbose: + print("Selection to test:", key) + s = recarr[key] + # Modify the s recarray + s["f0"][:] = data[:len(s)]*2 + s["f1"][:] = data[:len(s)]*3 + # Modify recarr and table + recarr[key] = s + table[key] = s + a = recarr[:] + b = table[:] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables modifications does not match.") + + def test02b_write(self): + """Test for point-selections (write, integer keys).""" + + data = self.data + recarr = self.recarr + table = self.table + for value1, value2 in self.limits: + key = np.where((data >= value1) & (data < value2)) + if common.verbose: + print("Selection to test:", key) + s = recarr[key] + # Modify the s recarray + s["f0"][:] = data[:len(s)]*2 + s["f1"][:] = data[:len(s)]*3 + # Modify recarr and table + recarr[key] = s + table[key] = s + a = recarr[:] + b = table[:] + np.testing.assert_array_equal( + a, b, "NumPy array and PyTables 
modifications does not match.") + + +# Test for building very large MD columns without defaults +class MDLargeColTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test01_create(self): + """Create a Table with a very large MD column. Ticket #211.""" + N = 2**18 # 4x larger than maximum object header size (64 KB) + cols = {'col1': tb.Int8Col(shape=N, dflt=0)} + tbl = self.h5file.create_table('/', 'test', cols) + tbl.row.append() # add a single row + tbl.flush() + if self.reopen: + self._reopen('a') + tbl = self.h5file.root.test + # Check the value + if common.verbose: + print("First row-->", tbl[0]['col1']) + np.testing.assert_array_equal(tbl[0]['col1'], np.zeros(N, 'i1')) + + +class MDLargeColNoReopen(MDLargeColTestCase): + reopen = False + + +class MDLargeColReopen(MDLargeColTestCase): + reopen = True + + +# Test with itertools.groupby that iterates on exhausted Row iterator +# See ticket #264. +class ExhaustedIter(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + + class Observations(tb.IsDescription): + market_id = tb.IntCol(pos=0) + scenario_id = tb.IntCol(pos=1) + value = tb.Float32Col(pos=3) + + table = self.h5file.create_table('/', 'observations', Observations, + chunkshape=32) + + # fill the database + observations = np.arange(225) + row = table.row + for market_id in range(5): + for scenario_id in range(3): + for obs in observations: + row['market_id'] = market_id + row['scenario_id'] = scenario_id + row['value'] = obs + row.append() + table.flush() + + def average(self, values): + return sum(values, 0.0) / len(values) + + def f_scenario(self, row): + return row['scenario_id'] + + def test00_groupby(self): + """Checking iterating an exhausted iterator (ticket #264)""" + rows = self.h5file.root.observations.where('(market_id == 3)') + scenario_means = [] + for scenario_id, rows_grouped in itertools.groupby(rows, + self.f_scenario): + vals = [row['value'] for row in rows_grouped] + scenario_means.append(self.average(vals)) + if common.verbose: + print('Means -->', scenario_means) + self.assertEqual(scenario_means, [112.0, 112.0, 112.0]) + + def test01_groupby(self): + """Checking iterating an exhausted iterator (ticket #264). 
Reopen.""" + + self._reopen() + + rows = self.h5file.root.observations.where('(market_id == 3)') + scenario_means = [] + for scenario_id, rows_grouped in itertools.groupby(rows, + self.f_scenario): + vals = [row['value'] for row in rows_grouped] + scenario_means.append(self.average(vals)) + if common.verbose: + print('Means -->', scenario_means) + self.assertEqual(scenario_means, [112.0, 112.0, 112.0]) + + +class SpecialColnamesTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test00_check_names(self): + f = self.h5file + a = np.array([(1, 2, 3)], dtype=[( + "a", int), ("_b", int), ("__c", int)]) + t = f.create_table(f.root, "test", a) + self.assertEqual(len(t.colnames), 3, "Number of columns incorrect") + if common.verbose: + print("colnames -->", t.colnames) + for name, name2 in zip(t.colnames, ("a", "_b", "__c")): + self.assertEqual(name, name2) + + +class RowContainsTestCase(common.TempFileMixin, common.PyTablesTestCase): + def test00_row_contains(self): + f = self.h5file + a = np.array([(1, 2, 3)], dtype="i1,i2,i4") + t = f.create_table(f.root, "test", a) + row = [r for r in t.iterrows()][0] + if common.verbose: + print("row -->", row[:]) + for item in (1, 2, 3): + self.assertIn(item, row) + self.assertNotIn(4, row) + + +class AccessClosedTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.table = self.h5file.create_table( + self.h5file.root, 'table', Record) + + row = self.table.row + for i in range(10): + row['var1'] = '%04d' % i + row['var2'] = i + row['var3'] = i % 3 + row.append() + self.table.flush() + + def test_read(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.read) + + def test_getitem(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.__getitem__, 0) + + def test_setitem(self): + data = self.table[0] + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.__setitem__, 0, data) + + def test_append(self): + data = self.table[0] + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.append, data) + + def test_readWhere(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.read_where, 'var2 > 3') + + def test_whereAppend(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, + self.table.append_where, self.table, 'var2 > 3') + + def test_getWhereList(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.get_where_list, 'var2 > 3') + + def test_readSorted(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.read_sorted, 'var2') + + def test_readCoordinates(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.table.read_coordinates, [2, 5]) + + +class ColumnIterationTestCase(common.TempFileMixin, common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.buffer_size = self.h5file.params['IO_BUFFER_SIZE'] + + def create_non_nested_table(self, nrows, dtype): + array = np.empty((nrows, ), dtype) + for name in dtype.names: + array[name] = np.random.randint(0, 10_000, nrows) + table = self.h5file.create_table('/', 'table', dtype) + table.append(array) + return array, table + + def iterate(self, array, table): + row_num = 0 + for item in table.cols.f0: + self.assertEqual(item, array['f0'][row_num]) + row_num += 1 + self.assertEqual(row_num, len(array)) + + def test_less_than_io_buffer(self): + dtype = np.format_parser(['i8'] * 3, [], []).dtype + rows_in_buffer = self.buffer_size 
// dtype[0].itemsize + array, table = self.create_non_nested_table(rows_in_buffer // 2, dtype) + self.iterate(array, table) + + def test_more_than_io_buffer(self): + dtype = np.format_parser(['i8'] * 3, [], []).dtype + rows_in_buffer = self.buffer_size // dtype[0].itemsize + array, table = self.create_non_nested_table(rows_in_buffer * 3, dtype) + self.iterate(array, table) + + def test_partially_filled_buffer(self): + dtype = np.format_parser(['i8'] * 3, [], []).dtype + rows_in_buffer = self.buffer_size // dtype[0].itemsize + array, table = self.create_non_nested_table(rows_in_buffer * 2 + 2, + dtype) + self.iterate(array, table) + + def test_zero_length_table(self): + dtype = np.format_parser(['i8'] * 3, [], []).dtype + array, table = self.create_non_nested_table(0, dtype) + self.assertEqual(len(table), 0) + self.iterate(array, table) + + +class TestCreateTableArgs(common.TempFileMixin, common.PyTablesTestCase): + obj = np.array( + [('aaaa', 1, 2.1), ('bbbb', 2, 3.2)], + dtype=[('name', 'S4'), ('icol', np.int32), ('fcol', np.float32)]) + where = '/' + name = 'table' + description, _ = tb.description.descr_from_dtype(obj.dtype) + title = 'title' + filters = None + expectedrows = 10_000 + chunkshape = None + byteorder = None + createparents = False + + def test_positional_args_01(self): + self.h5file.create_table(self.where, self.name, + self.description, + self.title, self.filters, + self.expectedrows) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (0,)) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + + def test_positional_args_02(self): + ptarr = self.h5file.create_table(self.where, self.name, + self.description, + self.title, + self.filters, + self.expectedrows) + ptarr.append(self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (len(self.obj),)) + self.assertEqual(ptarr.nrows, len(self.obj)) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + self.assertEqual(nparr.dtype, self.obj.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_positional_args_obj(self): + self.h5file.create_table(self.where, self.name, + None, + self.title, + self.filters, + self.expectedrows, + self.chunkshape, + self.byteorder, + self.createparents, + self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (len(self.obj),)) + self.assertEqual(ptarr.nrows, len(self.obj)) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj(self): + self.h5file.create_table(self.where, self.name, title=self.title, + obj=self.obj) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (len(self.obj),)) + self.assertEqual(ptarr.nrows, len(self.obj)) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_description_01(self): + ptarr = self.h5file.create_table(self.where, self.name, + title=self.title, + description=self.description) + ptarr.append(self.obj) + + self._reopen() + + ptarr = 
self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (len(self.obj),)) + self.assertEqual(ptarr.nrows, len(self.obj)) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_description_02(self): + ptarr = self.h5file.create_table(self.where, self.name, + title=self.title, + description=self.description) + # ptarr.append(self.obj) + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (0,)) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + + def test_kwargs_obj_description(self): + ptarr = self.h5file.create_table(self.where, self.name, + title=self.title, + obj=self.obj, + description=self.description) + + self._reopen() + + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read() + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (len(self.obj),)) + self.assertEqual(ptarr.nrows, len(self.obj)) + self.assertEqual(tuple(ptarr.colnames), self.obj.dtype.names) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_description_error_01(self): + self.assertRaises(TypeError, + self.h5file.create_table, + self.where, + self.name, + title=self.title, + obj=self.obj, + description=Record) + + def test_kwargs_obj_description_error_02(self): + self.assertRaises(TypeError, + self.h5file.create_table, + self.where, + self.name, + title=self.title, + obj=self.obj, + description=Record()) + + def test_kwargs_obj_description_error_03(self): + self.assertRaises(TypeError, + self.h5file.create_table, + self.where, + self.name, + title=self.title, + obj=self.obj, + description=RecordDescriptionDict) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # uncomment this only for testing purposes + + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicWriteTestCase)) + theSuite.addTest( + common.unittest.makeSuite(OldRecordBasicWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(DictWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(NumPyDTWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayOneWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayTwoWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayThreeWriteTestCase)) + theSuite.addTest( + common.unittest.makeSuite(RecArrayAlignedWriteTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CompressBloscShuffleTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CompressBloscBitShuffleTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CompressBloscBloscLZTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscLZ4TablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscLZ4HCTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscSnappyTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscZlibTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscZstdTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompressLZOTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressLZOShuffleTablesTestCase)) + 
theSuite.addTest(common.unittest.makeSuite(CompressZLIBTablesTestCase)) + theSuite.addTest( + common.unittest.makeSuite(CompressZLIBShuffleTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(Fletcher32TablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(AllFiltersTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompressTwoTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + SizeOnDiskInMemoryPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(NonNestedTableReadTestCase)) + theSuite.addTest(common.unittest.makeSuite(TableReadByteorderTestCase)) + theSuite.addTest(common.unittest.makeSuite(IterRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(GetColRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(GetItemTestCase)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase1)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase2)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase3)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase4)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase1)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase2)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase3)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase4)) + theSuite.addTest(common.unittest.makeSuite(RecArrayIO1)) + theSuite.addTest(common.unittest.makeSuite(RecArrayIO2)) + theSuite.addTest(common.unittest.makeSuite(OpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(AlignedOpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(AlignedCloseCopyTestCase)) + theSuite.addTest( + common.unittest.makeSuite(AlignedNoPaddingOpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex1TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex2TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex3TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex4TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex5TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex6TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex7TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex8TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex9TestCase)) + theSuite.addTest(common.unittest.makeSuite(DefaultValues)) + theSuite.addTest(common.unittest.makeSuite(OldRecordDefaultValues)) + theSuite.addTest(common.unittest.makeSuite(Length1TestCase)) + theSuite.addTest(common.unittest.makeSuite(Length2TestCase)) + theSuite.addTest(common.unittest.makeSuite(WhereAppendTestCase)) + theSuite.addTest(common.unittest.makeSuite(DerivedTableTestCase)) + theSuite.addTest(common.unittest.makeSuite(ChunkshapeTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZeroSizedTestCase)) + theSuite.addTest(common.unittest.makeSuite(IrregularStrideTestCase)) + theSuite.addTest(common.unittest.makeSuite(Issue262TestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateOpen1)) + theSuite.addTest(common.unittest.makeSuite(TruncateOpen2)) + theSuite.addTest(common.unittest.makeSuite(TruncateClose1)) + theSuite.addTest(common.unittest.makeSuite(TruncateClose2)) + theSuite.addTest(common.unittest.makeSuite(PointSelectionTestCase)) + theSuite.addTest(common.unittest.makeSuite(MDLargeColNoReopen)) + 
theSuite.addTest(common.unittest.makeSuite(MDLargeColReopen)) + theSuite.addTest(common.unittest.makeSuite(ExhaustedIter)) + theSuite.addTest(common.unittest.makeSuite(SpecialColnamesTestCase)) + theSuite.addTest(common.unittest.makeSuite(RowContainsTestCase)) + theSuite.addTest(common.unittest.makeSuite(AccessClosedTestCase)) + theSuite.addTest(common.unittest.makeSuite(ColumnIterationTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestCreateTableArgs)) + + if common.heavy: + theSuite.addTest( + common.unittest.makeSuite(CompressBzip2TablesTestCase)) + theSuite.addTest(common.unittest.makeSuite( + CompressBzip2ShuffleTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex10TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex11TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex12TestCase)) + theSuite.addTest(common.unittest.makeSuite(LargeRowSize)) + theSuite.addTest(common.unittest.makeSuite(BigTablesTestCase)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_tablesMD.py b/tables/tests/test_tablesMD.py new file mode 100644 index 0000000..6c6953c --- /dev/null +++ b/tables/tests/test_tablesMD.py @@ -0,0 +1,2250 @@ +import sys + +import numpy as np + +import tables as tb +from tables.tests import common + + +# It is important that columns are ordered according to their names +# to ease the comparison with structured arrays. + +# Test Record class +class Record(tb.IsDescription): + var0 = tb.StringCol(itemsize=4, dflt=b"", shape=2) # 4-char str array + var1 = tb.StringCol(itemsize=4, dflt=[b"abcd", b"efgh"], shape=(2, 2)) + var1_ = tb.IntCol(dflt=((1, 1),), shape=2) # integer array + var2 = tb.IntCol(dflt=((1, 1), (1, 1)), shape=(2, 2)) # integer array + var3 = tb.Int16Col(dflt=2) # short integer + var4 = tb.FloatCol(dflt=3.1) # double (double-precision) + var5 = tb.Float32Col(dflt=4.2) # float (single-precision) + var6 = tb.UInt16Col(dflt=5) # unsigned short integer + var7 = tb.StringCol(itemsize=1, dflt=b"e") # 1-character String + + +# Dictionary definition +RecordDescriptionDict = { + 'var0': tb.StringCol(itemsize=4, dflt=b"", shape=2), # 4-char str array + 'var1': tb.StringCol(itemsize=4, dflt=[b"abcd", b"efgh"], shape=(2, 2)), + # 'var0': StringCol(itemsize=4, shape=2), # 4-character String + # 'var1': StringCol(itemsize=4, shape=(2,2)), # 4-character String + 'var1_': tb.IntCol(shape=2), # integer array + 'var2': tb.IntCol(shape=(2, 2)), # integer array + 'var3': tb.Int16Col(), # short integer + 'var4': tb.FloatCol(), # double (double-precision) + 'var5': tb.Float32Col(), # float (single-precision) + 'var6': tb.Int16Col(), # unsigned short integer + 'var7': tb.StringCol(itemsize=1), # 1-character String +} + +# Record class with numpy dtypes (mixed shapes is checked here) + + +class RecordDT(tb.IsDescription): + var0 = tb.Col.from_dtype(np.dtype("2S4"), dflt=b"") # shape in dtype + var1 = tb.Col.from_dtype(np.dtype(("S4", ( + 2, 2))), dflt=[b"abcd", b"efgh"]) # shape is a mix + var1_ = tb.Col.from_dtype( + np.dtype("2i4"), dflt=((1, 1),)) # shape in dtype + var2 = tb.Col.from_sctype("i4", shape=( + 2, 2), dflt=((1, 1), (1, 1))) # shape is a mix + var3 = tb.Col.from_dtype(np.dtype("i2"), dflt=2) + var4 = tb.Col.from_dtype(np.dtype("2f8"), dflt=3.1) + var5 = tb.Col.from_dtype(np.dtype("f4"), dflt=4.2) + var6 = tb.Col.from_dtype(np.dtype("()u2"), dflt=5) + var7 = tb.Col.from_dtype(np.dtype("S1"), 
dflt=b"e") # no shape + + +class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase): + # file = "test.h5" + open_mode = "w" + title = "This is the table title" + expectedrows = 100 + appendrows = 20 + compress = 0 + complib = "zlib" # Default compression library + record = Record + recarrayinit = 0 + maxshort = 1 << 15 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + self.h5file.close() + + def initRecArray(self): + record = self.recordtemplate + row = record[0] + buflist = [] + # Fill the recarray + for i in range(self.expectedrows): + tmplist = [] + # Both forms (list or chararray) works + var0 = ['%04d' % (self.expectedrows - i)] * 2 + tmplist.append(var0) + var1 = [['%04d' % (self.expectedrows - i)] * 2] * 2 + tmplist.append(var1) + var1_ = (i, 1) + tmplist.append(var1_) + var2 = ((i, 1), (1, 1)) # *-* + tmplist.append(var2) + var3 = i % self.maxshort + tmplist.append(var3) + if isinstance(row['var4'], np.ndarray): + tmplist.append([float(i), float(i * i)]) + else: + tmplist.append(float(i)) + if isinstance(row['var5'], np.ndarray): + tmplist.append(np.array((float(i),)*4)) + else: + tmplist.append(float(i)) + # var6 will be like var3 but byteswaped + tmplist.append(((var3 >> 8) & 0xff) + ((var3 << 8) & 0xff00)) + var7 = var1[0][0][-1] + tmplist.append(var7) + buflist.append(tuple(tmplist)) + + self.record = np.rec.array(buflist, dtype=record.dtype, + shape=self.expectedrows) + + def populateFile(self): + group = self.rootgroup + if self.recarrayinit: + # Initialize an starting buffer, if any + self.initRecArray() + for j in range(3): + # Create a table + filters = tb.Filters(complevel=self.compress, complib=self.complib) + if j < 2: + byteorder = sys.byteorder + else: + # table2 will be byteswapped + byteorder = {"little": "big", "big": "little"}[sys.byteorder] + table = self.h5file.create_table(group, 'table'+str(j), + self.record, + title=self.title, + filters=filters, + expectedrows=self.expectedrows, + byteorder=byteorder) + if not self.recarrayinit: + # Get the row object associated with the new table + row = table.row + + # Fill the table + for i in range(self.expectedrows): + s = '%04d' % (self.expectedrows - i) + row['var0'] = s.encode('ascii') + row['var1'] = s.encode('ascii') + row['var7'] = s[-1].encode('ascii') + row['var1_'] = (i, 1) + row['var2'] = ((i, 1), (1, 1)) # *-* + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + # var6 will be like var3 but byteswaped + row['var6'] = (((row['var3'] >> 8) & 0xff) + + ((row['var3'] << 8) & 0xff00)) + row.append() + + # Flush the buffer for this table + table.flush() + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) + group = group2 + + def test00_description(self): + """Checking table description and descriptive fields.""" + + self.h5file = tb.open_file(self.h5fname) + + tbl = self.h5file.get_node('/table0') + desc = tbl.description + + if isinstance(self.record, dict): + columns = self.record + elif isinstance(self.record, np.ndarray): + descr, _ = tb.description.descr_from_dtype(self.record.dtype) + columns = descr._v_colobjects + elif isinstance(self.record, np.dtype): + descr, _ = 
tb.description.descr_from_dtype(self.record) + columns = descr._v_colobjects + else: + # This is an ordinary description. + columns = self.record.columns + + # Check table and description attributes at the same time. + # These checks are only valid for non-nested tb. + + # Column names. + expectedNames = ['var0', 'var1', 'var1_', 'var2', 'var3', 'var4', + 'var5', 'var6', 'var7'] + self.assertEqual(expectedNames, list(tbl.colnames)) + self.assertEqual(expectedNames, list(desc._v_names)) + + # Column types. + expectedTypes = [columns[colname].dtype + for colname in expectedNames] + self.assertEqual(expectedTypes, + [tbl.coldtypes[v] for v in expectedNames]) + self.assertEqual(expectedTypes, + [desc._v_dtypes[v] for v in expectedNames]) + + # Column string types. + expectedTypes = [columns[colname].type + for colname in expectedNames] + self.assertEqual(expectedTypes, + [tbl.coltypes[v] for v in expectedNames]) + self.assertEqual(expectedTypes, + [desc._v_types[v] for v in expectedNames]) + + # Column defaults. + for v in expectedNames: + if common.verbose: + print("dflt-->", columns[v].dflt) + print("coldflts-->", tbl.coldflts[v]) + print("desc.dflts-->", desc._v_dflts[v]) + self.assertTrue(common.areArraysEqual(tbl.coldflts[v], + columns[v].dflt)) + self.assertTrue(common.areArraysEqual(desc._v_dflts[v], + columns[v].dflt)) + + # Column path names. + self.assertEqual(expectedNames, list(desc._v_pathnames)) + + # Column objects. + for colName in expectedNames: + expectedCol = columns[colName] + col = desc._v_colobjects[colName] + self.assertEqual(expectedCol.dtype, col.dtype) + self.assertEqual(expectedCol.type, col.type) + + def test01_readTable(self): + """Checking table read and cuts.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result = [r['var2'][0][0] for r in table.iterrows() + if r['var2'][0][0] < 20] + + if common.verbose: + print("Table:", repr(table)) + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", r) + print("Total selected records in table ==> ", len(result)) + nrows = self.expectedrows - 1 + r = [r for r in table.iterrows() if r['var2'][0][0] < 20][-1] + self.assertEqual(( + r['var0'][0], + r['var1'][0][0], + r['var1_'][0], + r['var2'][0][0], + r['var7'] + ), (b"0001", b"0001", nrows, nrows, b"1")) + if isinstance(r['var5'], np.ndarray): + self.assertTrue(common.allequal( + r['var5'], np.array((nrows,)*4, np.float32))) + else: + self.assertEqual(r['var5'], float(nrows)) + self.assertEqual(len(result), 20) + + def test01b_readTable(self): + """Checking table read and cuts (multidimensional columns case)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_readTable..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + # Choose a small value for buffer size + table.nrowsinbuf = 3 + # Read the records and select those with "var2" file less than 20 + result1 = [r['var5'] for r in table.iterrows() if r['var2'][0][0] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("Last record in table ==>", r) + print("Total selected records in table ==> ", len(result1)) + nrows = table.nrows + result2 = [r for r in table.iterrows() if r['var2'][0][0] < 20][-1] + if isinstance(result2['var5'], np.ndarray): + self.assertTrue(common.allequal( + result1[0], np.array((float(0),) * 4, np.float32))) + self.assertTrue(common.allequal( + result1[1], np.array((float(1),) * 4, np.float32))) + self.assertTrue(common.allequal( + result1[2], np.array((float(2),) * 4, np.float32))) + self.assertTrue(common.allequal( + result1[3], np.array((float(3),) * 4, np.float32))) + self.assertTrue(common.allequal( + result1[10], np.array((float(10),) * 4, np.float32))) + self.assertTrue(common.allequal( + result2['var5'], np.array((float(nrows - 1),) * 4, np.float32) + )) + else: + self.assertEqual(result2['var5'], float(nrows - 1)) + self.assertEqual(len(result1), 20) + + # Read the records and select those with "var2" file less than 20 + result1 = [r['var1'] for r in table.iterrows() if r['var2'][0][0] < 20] + result2 = [r for r in table.iterrows() if r['var2'][0][0] < 20][-1] + + if result2['var1'].dtype.char == "S": + a = np.array([['%04d' % (self.expectedrows - 0)]*2]*2, 'S') + self.assertTrue(common.allequal(result1[0], a)) + a = np.array([['%04d' % (self.expectedrows - 1)]*2]*2, 'S') + self.assertTrue(common.allequal(result1[1], a)) + a = np.array([['%04d' % (self.expectedrows - 2)]*2]*2, 'S') + self.assertTrue(common.allequal(result1[2], a)) + a = np.array([['%04d' % (self.expectedrows - 3)]*2]*2, 'S') + self.assertTrue(common.allequal(result1[3], a)) + a = np.array([['%04d' % (self.expectedrows - 10)]*2]*2, 'S') + self.assertTrue(common.allequal(result1[10], a)) + a = np.array([['%04d' % (1)]*2]*2, 'S') + self.assertTrue(common.allequal(result2['var1'], a)) + else: + self.assertEqual(result1['var1'], "0001") + self.assertEqual(len(result1), 20) + + def test01c_readTable(self): + """Checking shape of multidimensional columns.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_readTable..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + if common.verbose: + print("var2 col shape:", table.cols.var2.shape) + print("Should be:", table.cols.var2[:].shape) + self.assertEqual(table.cols.var2.shape, table.cols.var2[:].shape) + + def test02_AppendRows(self): + """Checking whether appending record rows works or not.""" + + # Now, open it, but in "append" mode + self.h5file = tb.open_file(self.h5fname, mode="a") + self.rootgroup = self.h5file.root + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_AppendRows..." 
% self.__class__.__name__)
+
+ # Get a table
+ table = self.h5file.get_node("/group0/table1")
+ # Get its row object
+ row = table.row
+ if common.verbose:
+ print("Nrows in old", table._v_pathname, ":", table.nrows)
+ print("Record Format ==>", table.description._v_nested_formats)
+ print("Record Size ==>", table.rowsize)
+ # Append some rows
+ for i in range(self.appendrows):
+ s = '%04d' % (self.appendrows - i)
+ row['var0'] = s.encode('ascii')
+ row['var1'] = s.encode('ascii')
+ row['var7'] = s[-1].encode('ascii')
+ row['var1_'] = (i, 1)
+ row['var2'] = ((i, 1), (1, 1)) # *-*
+ row['var3'] = i % self.maxshort
+ if isinstance(row['var4'], np.ndarray):
+ row['var4'] = [float(i), float(i * i)]
+ else:
+ row['var4'] = float(i)
+ if isinstance(row['var5'], np.ndarray):
+ row['var5'] = np.array((float(i),)*4)
+ else:
+ row['var5'] = float(i)
+ row.append()
+
+ # Flush the buffer for this table and read it
+ table.flush()
+ result = [r['var2'][0][0] for r in table.iterrows()
+ if r['var2'][0][0] < 20]
+ row = [r for r in table.iterrows() if r['var2'][0][0] < 20][-1]
+
+ nrows = self.appendrows - 1
+ self.assertEqual((
+ row['var0'][0],
+ row['var1'][0][0],
+ row['var1_'][0],
+ row['var2'][0][0],
+ row['var7']),
+ (b"0001", b"0001", nrows, nrows, b"1"))
+ if isinstance(row['var5'], np.ndarray):
+ self.assertTrue(common.allequal(
+ row['var5'], np.array((float(nrows),) * 4, np.float32)))
+ else:
+ self.assertEqual(row['var5'], float(nrows))
+ if self.appendrows <= 20:
+ add = self.appendrows
+ else:
+ add = 20
+ self.assertEqual(len(result), 20 + add) # because we appended new rows
+ # del table
+
+ # CAVEAT: The next test only works for tables with rows < 2**15
+ def test03_endianess(self):
+ """Checking if the table is endianness aware."""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test03_endianess..."
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/group0/group1/table2") + + # Read the records and select the ones with "var3" column less than 20 + result = [r['var2'] for r in table.iterrows() if r['var3'] < 20] + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + print("On-disk byteorder ==>", table.byteorder) + print("Last record in table ==>", r) + print("Total selected records in table ==>", len(result)) + nrows = self.expectedrows - 1 + r = list(table.iterrows())[-1] + self.assertEqual((r['var1'][0][0], r['var3']), (b"0001", nrows)) + self.assertEqual(len(result), 20) + + +class BasicWriteTestCase(BasicTestCase): + title = "BasicWrite" + + +class DictWriteTestCase(BasicTestCase): + # This checks also unidimensional arrays as columns + title = "DictWrite" + record = RecordDescriptionDict + nrows = 21 + nrowsinbuf = 3 # Choose a small value for the buffer size + start = 0 + stop = 10 + step = 3 + + +class RecordDTWriteTestCase(BasicTestCase): + title = "RecordDTWriteTestCase" + record = RecordDT + + +# Pure NumPy dtype +class NumPyDTWriteTestCase(BasicTestCase): + title = "NumPyDTWriteTestCase" + record = np.dtype("(2,)S4,(2,2)S4,(2,)i4,(2,2)i4,i2,2f8,f4,i2,S1") + record.names = 'var0,var1,var1_,var2,var3,var4,var5,var6,var7'.split(',') + + +class RecArrayOneWriteTestCase(BasicTestCase): + title = "RecArrayOneWrite" + record = np.rec.array( + None, + formats="(2,)S4,(2,2)S4,(2,)i4,(2,2)i4,i2,2f8,f4,i2,S1", + names='var0,var1,var1_,var2,var3,var4,var5,var6,var7', + shape=0) + + +class RecArrayTwoWriteTestCase(BasicTestCase): + title = "RecArrayTwoWrite" + expectedrows = 100 + recarrayinit = 1 + recordtemplate = np.rec.array( + None, + formats="(2,)a4,(2,2)a4,(2,)i4,(2,2)i4,i2,f8,f4,i2,a1", + names='var0,var1,var1_,var2,var3,var4,var5,var6,var7', + shape=1) + + +class RecArrayThreeWriteTestCase(BasicTestCase): + title = "RecArrayThreeWrite" + expectedrows = 100 + recarrayinit = 1 + recordtemplate = np.rec.array( + None, + formats="(2,)a4,(2,2)a4,(2,)i4,(2,2)i4,i2,2f8,4f4,i2,a1", + names='var0,var1,var1_,var2,var3,var4,var5,var6,var7', + shape=1) + + +class RecArrayAlignedWriteTestCase(BasicTestCase): + title = "RecArrayThreeWrite" + expectedrows = 100 + recarrayinit = 1 + recordtemplate = np.rec.array( + None, + formats="(2,)a4,(2,2)a4,(2,)i4,(2,2)i4,i2,2f8,4f4,i2,a1", + names='var0,var1,var1_,var2,var3,var4,var5,var6,var7', + shape=1, aligned=True) + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class CompressBloscTablesTestCase(BasicTestCase): + title = "CompressBloscTables" + compress = 1 + complib = "blosc" + + +@common.unittest.skipIf(not common.lzo_avail, + 'LZO compression library not available') +class CompressLZOTablesTestCase(BasicTestCase): + title = "CompressLZOTables" + compress = 1 + complib = "lzo" + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class CompressBzip2TablesTestCase(BasicTestCase): + title = "CompressBzip2Tables" + compress = 1 + complib = "bzip2" + + +class CompressZLIBTablesTestCase(BasicTestCase): + title = "CompressOneTables" + compress = 1 + complib = "zlib" + + +class CompressTwoTablesTestCase(BasicTestCase): + title = "CompressTwoTables" + compress = 1 + # This checks also unidimensional arrays as columns + record = RecordDescriptionDict + + +class BigTablesTestCase(BasicTestCase): + title = "BigTables" + # 10000 rows takes 
much more time than we can afford for tests + # reducing to 1000 would be more than enough + # F. Alted 2004-01-19 + + # expectedrows = 10000 + # appendrows = 1000 + expectedrows = 1000 + appendrows = 100 + + +class BasicRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + # file = "test.h5" + open_mode = "w" + title = "This is the table title" + record = Record + maxshort = 1 << 15 + expectedrows = 100 + compress = 0 + # Default values + nrows = 20 + nrowsinbuf = 3 # Choose a small value for the buffer size + start = 1 + stop = nrows + checkrecarray = 0 + checkgetCol = 0 + + def setUp(self): + super().setUp() + + # Create an instance of an HDF5 Table + self.rootgroup = self.h5file.root + self.populateFile() + self.h5file.close() + + def populateFile(self): + group = self.rootgroup + for j in range(3): + # Create a table + table = self.h5file.create_table( + group, 'table'+str(j), self.record, title=self.title, + filters=tb.Filters(self.compress), + expectedrows=self.expectedrows) + # Get the row object associated with the new table + row = table.row + + # Fill the table + for i in range(self.expectedrows): + row['var1'] = '%04d' % (self.expectedrows - i) + row['var7'] = row['var1'][0][0][-1] + row['var2'] = i + row['var3'] = i % self.maxshort + if isinstance(row['var4'], np.ndarray): + row['var4'] = [float(i), float(i * i)] + else: + row['var4'] = float(i) + if isinstance(row['var5'], np.ndarray): + row['var5'] = np.array((float(i),)*4) + else: + row['var5'] = float(i) + # var6 will be like var3 but byteswaped + row['var6'] = (((row['var3'] >> 8) & 0xff) + + ((row['var3'] << 8) & 0xff00)) + row.append() + + # Flush the buffer for this table + table.flush() + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) + group = group2 + + def check_range(self): + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + table = self.h5file.get_node("/table0") + + table.nrowsinbuf = self.nrowsinbuf + resrange = slice(self.start, self.stop, self.step).indices(table.nrows) + reslength = len(list(range(*resrange))) + if self.checkrecarray: + recarray = table.read(self.start, self.stop, self.step) + result = [] + for nrec in range(len(recarray)): + if recarray['var2'][nrec][0][0] < self.nrows and 0 < self.step: + result.append(recarray['var2'][nrec][0][0]) + elif (recarray['var2'][nrec][0][0] > self.nrows and + 0 > self.step): + result.append(recarray['var2'][nrec][0][0]) + elif self.checkgetCol: + column = table.read(self.start, self.stop, self.step, 'var2') + result = [] + for nrec in range(len(column)): + if column[nrec][0][0] < self.nrows and 0 < self.step: # *-* + result.append(column[nrec][0][0]) # *-* + elif column[nrec][0][0] > self.nrows and 0 > self.step: # *-* + result.append(column[nrec][0][0]) # *-* + else: + if 0 < self.step: + result = [ + r['var2'][0][0] for r in table.iterrows(self.start, + self.stop, + self.step) + if r['var2'][0][0] < self.nrows + ] + elif 0 > self.step: + result = [ + r['var2'][0][0] for r in table.iterrows(self.start, + self.stop, + self.step) + if r['var2'][0][0] > self.nrows + ] + + if self.start < 0: + startr = self.expectedrows + self.start + else: + startr = self.start + + if self.stop is None: + if self.checkrecarray or self.checkgetCol: + # data read using the read method + stopr = startr + 1 + else: + # data read using the iterrows method + stopr = self.nrows + elif self.stop < 0: + stopr = self.expectedrows + self.stop + else: 
+ stopr = self.stop + + if self.nrows < stopr: + stopr = self.nrows + + if common.verbose: + print("Nrows in", table._v_pathname, ":", table.nrows) + if reslength: + if self.checkrecarray: + print("Last record *read* in recarray ==>", recarray[-1]) + elif self.checkgetCol: + print("Last value *read* in getCol ==>", column[-1]) + else: + print("Last record *read* in table range ==>", r) + print("Total number of selected records ==>", len(result)) + print("Selected records:\n", result) + print("Selected records should look like:\n", + list(range(startr, stopr, self.step))) + print("start, stop, step ==>", startr, stopr, self.step) + + self.assertEqual(result, list(range(startr, stopr, self.step))) + if not (self.checkrecarray or self.checkgetCol): + if startr < stopr and 0 < self.step: + r = [r['var2'] for r in table.iterrows(self.start, self.stop, + self.step) + if r['var2'][0][0] < self.nrows][-1] + if self.nrows > self.expectedrows: + self.assertEqual( + r[0][0], + list(range(self.start, self.stop, self.step))[-1]) + else: + self.assertEqual(r[0][0], + list(range(startr, stopr, self.step))[-1]) + elif startr > stopr and 0 > self.step: + r = [r['var2'] for r in table.iterrows(self.start, self.stop, + self.step) + if r['var2'][0][0] > self.nrows][0] + if self.nrows < self.expectedrows: + self.assertEqual( + r[0][0], + list(range(self.start, self.stop or -1, self.step))[0]) + else: + self.assertEqual( + r[0][0], + list(range(startr, stopr or -1, self.step))[0]) + + # Close the file + self.h5file.close() + + def test01_range(self): + """Checking ranges in table iterators (case1)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 2 * step + self.nrows = 21 + self.nrowsinbuf = 3 + self.start = 0 + self.stop = self.expectedrows + self.step = 2 + + self.check_range() + + def test01a_range(self): + """Checking ranges in table iterators (case1)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 2 * step + self.nrows = 21 + self.nrowsinbuf = 3 + self.start = self.expectedrows - 1 + self.stop = None + self.step = -2 + + self.check_range() + + def test02_range(self): + """Checking ranges in table iterators (case2)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 10 * step + self.nrows = 21 + self.nrowsinbuf = 31 + self.start = 11 + self.stop = self.expectedrows + self.step = 3 + + self.check_range() + + def test03_range(self): + """Checking ranges in table iterators (case3)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_range..." % self.__class__.__name__) + + # Case where step < nrowsinbuf < 1.1 * step + self.nrows = self.expectedrows + self.nrowsinbuf = 11 # Choose a small value for the buffer size + self.start = 0 + self.stop = self.expectedrows + self.step = 10 + + self.check_range() + + def test04_range(self): + """Checking ranges in table iterators (case4)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_range..." 
% self.__class__.__name__) + + # Case where step == nrowsinbuf + self.nrows = self.expectedrows + self.nrowsinbuf = 11 # Choose a small value for the buffer size + self.start = 1 + self.stop = self.expectedrows + self.step = 11 + + self.check_range() + + def test05_range(self): + """Checking ranges in table iterators (case5)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_range..." % self.__class__.__name__) + + # Case where step > 1.1 * nrowsinbuf + self.nrows = 21 + self.nrowsinbuf = 10 # Choose a small value for the buffer size + self.start = 1 + self.stop = self.expectedrows + self.step = 11 + + self.check_range() + + def test06_range(self): + """Checking ranges in table iterators (case6)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06_range..." % self.__class__.__name__) + + # Case where step > 3 * nrowsinbuf + self.nrows = 3 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = 2 + self.stop = self.expectedrows + self.step = 10 + + self.check_range() + + def test07_range(self): + """Checking ranges in table iterators (case7)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_range..." % self.__class__.__name__) + + # Case where start == stop + self.nrows = 2 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = self.nrows + self.stop = self.nrows + self.step = 10 + + self.check_range() + + def test08_range(self): + """Checking ranges in table iterators (case8)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08_range..." % self.__class__.__name__) + + # Case where start > stop + self.nrows = 2 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = self.nrows + 1 + self.stop = self.nrows + self.step = 1 + + self.check_range() + + def test09_range(self): + """Checking ranges in table iterators (case9)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test09_range..." % self.__class__.__name__) + + # Case where stop = None + self.nrows = 100 + self.nrowsinbuf = 3 # Choose a small value for the buffer size + self.start = 1 + self.stop = 2 + self.step = 1 + + self.check_range() + + def test10_range(self): + """Checking ranges in table iterators (case10)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test10_range..." % self.__class__.__name__) + + # Case where start < 0 and stop = 0 + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -6 + self.startr = self.expectedrows + self.start + self.stop = 0 + self.stopr = self.expectedrows + self.stop + self.step = 2 + + self.check_range() + + def test11_range(self): + """Checking ranges in table iterators (case11)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test11_range..." % self.__class__.__name__) + + # Case where start < 0 and stop < 0 + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -6 + self.startr = self.expectedrows + self.start + self.stop = -2 + self.stopr = self.expectedrows + self.stop + self.step = 1 + + self.check_range() + + def test12_range(self): + """Checking ranges in table iterators (case12)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test12_range..." 
% self.__class__.__name__) + + # Case where start < 0 and stop < 0 and start > stop + self.nrows = self.expectedrows + self.nrowsinbuf = 5 # Choose a small value for the buffer size + self.start = -1 + self.startr = self.expectedrows + self.start + self.stop = -2 + self.stopr = self.expectedrows + self.stop + self.step = 1 + + self.check_range() + + def test13_range(self): + """Checking ranges in table iterators (case13)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test13_range..." % self.__class__.__name__) + + # Case where step < 0 + self.step = -11 + try: + self.check_range() + except ValueError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next ValueError was catched!") + self.h5file.close() + # else: + # self.fail("expected a ValueError") + + # Case where step == 0 + self.step = 0 + try: + self.check_range() + except ValueError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next ValueError was catched!") + self.h5file.close() + # else: + # self.fail("expected a ValueError") + + +class IterRangeTestCase(BasicRangeTestCase): + pass + + +class RecArrayRangeTestCase(BasicRangeTestCase): + checkrecarray = 1 + + +class GetColRangeTestCase(BasicRangeTestCase): + checkgetCol = 1 + + def test01_nonexistentField(self): + """Checking non-existing Field in getCol method """ + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_nonexistentField..." % + self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + self.root = self.h5file.root + table = self.h5file.get_node("/table0") + + with self.assertRaises(KeyError): + table.read(field='non-existent-column') + + +class Rec(tb.IsDescription): + col1 = tb.IntCol(pos=1, shape=(2,)) + col2 = tb.StringCol(itemsize=3, pos=2, shape=(3,)) + col3 = tb.FloatCol(pos=3, shape=(3, 2)) + + +class RecArrayIO(common.TempFileMixin, common.PyTablesTestCase): + + def test00(self): + """Checking saving a normal recarray""" + + # Create a recarray + intlist1 = [[456, 23]*3]*2 + intlist2 = np.array([[2, 2]*3]*2, dtype=int) + arrlist1 = [['dbe']*2]*3 + arrlist2 = [['de']*2]*3 + floatlist1 = [[1.2, 2.3]*3]*4 + floatlist2 = np.array([[4.5, 2.4]*3]*4) + b = [(intlist1, arrlist1, floatlist1), ( + intlist2, arrlist2, floatlist2)] + r = np.rec.array(b, formats='(2,6)i4,(3,2)a3,(4,6)f8', + names='col1,col2,col3') + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r) + + # Read it again + r2 = self.h5file.root.recarray.read() + + self.assertEqual(r.tobytes(), r2.tobytes()) + + def test01(self): + """Checking saving a recarray with an offset in its buffer""" + + # Create a recarray + intlist1 = [[456, 23]*3]*2 + intlist2 = np.array([[2, 2]*3]*2, dtype=int) + arrlist1 = [['dbe']*2]*3 + arrlist2 = [['de']*2]*3 + floatlist1 = [[1.2, 2.3]*3]*4 + floatlist2 = np.array([[4.5, 2.4]*3]*4) + b = [(intlist1, arrlist1, floatlist1), ( + intlist2, arrlist2, floatlist2)] + r = np.rec.array(b, formats='(2,6)i4,(3,2)a3,(4,6)f8', + names='col1,col2,col3') + + # Get a view of the recarray + r1 = r[1:] + + # Save it in a table: + self.h5file.create_table(self.h5file.root, 'recarray', r1) + + # Read it again + r2 = self.h5file.root.recarray.read() + + self.assertEqual(r1.tobytes(), r2.tobytes()) + + def test02(self): + """Checking saving a slice of a large recarray""" + + # Create a recarray + intlist1 = [[[23, 24, 35]*6]*6] + intlist2 = np.array([[[2, 3, 4]*6]*6], dtype=int) + 
arrlist1 = [['dbe']*2]*3
+ arrlist2 = [['de']*2]*3
+ floatlist1 = [[1.2, 2.3]*3]*4
+ floatlist2 = np.array([[4.5, 2.4]*3]*4)
+ b = [(intlist1, arrlist1, floatlist1), (
+ intlist2, arrlist2, floatlist2)]
+ r = np.rec.array(b * 300, formats='(1,6,18)i4,(3,2)a3,(4,6)f8',
+ names='col1,col2,col3')
+
+ # Get a slice of the recarray
+ r1 = r[290:292]
+
+ # Save it in a table:
+ self.h5file.create_table(self.h5file.root, 'recarray', r1)
+
+ # Read it again
+ r2 = self.h5file.root.recarray.read()
+
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+
+ def test03(self):
+ """Checking saving a slice of a strided recarray"""
+
+ # Create a recarray
+ intlist1 = [[[23, 24, 35]*6]*6]
+ intlist2 = np.array([[[2, 3, 4]*6]*6], dtype=int)
+ arrlist1 = [['dbe']*2]*3
+ arrlist2 = [['de']*2]*3
+ floatlist1 = [[1.2, 2.3]*3]*4
+ floatlist2 = np.array([[4.5, 2.4]*3]*4)
+ b = [(intlist1, arrlist1, floatlist1), (
+ intlist2, arrlist2, floatlist2)]
+ r = np.rec.array(b * 300, formats='(1,6,18)i4,(3,2)a3,(4,6)f8',
+ names='col1,col2,col3', shape=600)
+
+ # Get a strided recarray
+ r2 = r[::2]
+
+ # Get a slice
+ r1 = r2[148:]
+
+ # Save it in a table:
+ self.h5file.create_table(self.h5file.root, 'recarray', r1)
+
+ # Read it again
+ r2 = self.h5file.root.recarray.read()
+
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+
+ def test08a(self):
+ """Checking modifying one column (single column version, list)"""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test08a..." % self.__class__.__name__)
+
+ # Create a new table:
+ table = self.h5file.create_table(self.h5file.root, 'recarray', Rec)
+
+ # Append new rows
+ s0, s1, s2, s3 = ['dbe']*3, ['ded']*3, ['db1']*3, ['de1']*3
+ f0, f1, f2, f3 = [[1.2]*2]*3, [[1.3]*2]*3, [[1.4]*2]*3, [[1.5]*2]*3
+ r = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1)],
+ formats="(2,)i4,(3,)a3,(3,2)f8")
+ table.append(r)
+ table.append([([457, 458], s2, f2), ([5, 6], s3, f3)])
+
+ # Modify just one existing column
+ table.cols.col1[1:] = [[[2, 3], [3, 4], [4, 5]]]
+
+ # Create the modified recarray
+ r1 = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1),
+ ([3, 4], s2, f2), ([4, 5], s3, f3)],
+ formats="(2,)i4,(3,)a3,(3,2)f8",
+ names="col1,col2,col3")
+
+ # Read the modified table
+ r2 = table.read()
+ if common.verbose:
+ print("Original table-->", repr(r2))
+ print("Should look like-->", repr(r1))
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+ self.assertEqual(table.nrows, 4)
+
+ def test08b(self):
+ """Checking modifying one column (single column version, recarray)"""
+
+ if common.verbose:
+ print('\n', '-=' * 30)
+ print("Running %s.test08b..."
% self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # Append new rows + s0, s1, s2, s3 = ['dbe']*3, ['ded']*3, ['db1']*3, ['de1']*3 + f0, f1, f2, f3 = [[1.2]*2]*3, [[1.3]*2]*3, [[1.4]*2]*3, [[1.5]*2]*3 + r = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1)], + formats="(2,)i4,(3,)a3,(3,2)f8") + table.append(r) + table.append([([457, 458], s2, f2), ([5, 6], s3, f3)]) + + # Modify just one existing column + columns = np.rec.fromarrays( + np.array([[[2, 3], [3, 4], [4, 5]]]), formats="i4") + table.modify_columns(start=1, columns=columns, names=["col1"]) + + # Create the modified recarray + r1 = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1), + ([3, 4], s2, f2), ([4, 5], s3, f3)], + formats="(2,)i4,(3,)a3,(3,2)f8", + names="col1,col2,col3") + + # Read the modified table + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08b2(self): + """Checking modifying one column (single column version, recarray, + modify_column)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test08b2..." % self.__class__.__name__) + + # Create a new table: + table = self.h5file.create_table(self.h5file.root, 'recarray', Rec) + + # Append new rows + s0, s1, s2, s3 = ['dbe']*3, ['ded']*3, ['db1']*3, ['de1']*3 + f0, f1, f2, f3 = [[1.2]*2]*3, [[1.3]*2]*3, [[1.4]*2]*3, [[1.5]*2]*3 + r = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1)], + formats="(2,)i4,(3,)a3,(3,2)f8") + table.append(r) + table.append([([457, 458], s2, f2), ([5, 6], s3, f3)]) + + # Modify just one existing column + columns = np.rec.fromarrays( + np.array([[[2, 3], [3, 4], [4, 5]]]), formats="i4") + table.modify_column(start=1, column=columns, colname="col1") + + # Create the modified recarray + r1 = np.rec.array([([456, 457], s0, f0), ([2, 3], s1, f1), + ([3, 4], s2, f2), ([4, 5], s3, f3)], + formats="(2,)i4,(3,)a3,(3,2)f8", + names="col1,col2,col3") + + # Read the modified table + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + +class DefaultValues(common.TempFileMixin, common.PyTablesTestCase): + + def test00(self): + """Checking saving a Table MD with default values""" + + # Create a table + table = self.h5file.create_table(self.h5file.root, 'table', Record) + + # Take a number of records a bit large + # nrows = int(table.nrowsinbuf * 1.1) + nrows = 5 # for test + # Fill the table with nrows records + for i in range(nrows): + if i == 3 or i == 4: + table.row['var2'] = ((2, 2), (2, 2)) # *-* + # This injects the row values. + table.row.append() + + # We need to flush the buffers in table in order to get an + # accurate number of records on it. 
+ table.flush() + + # Create a recarray with the same default values + buffer = [( + ["\x00"]*2, # just "" does not initialize the buffer properly + [["abcd", "efgh"]]*2, + (1, 1), + ((1, 1), (1, 1)), + 2, 3.1, 4.2, 5, "e")] + r = np.rec.array( + buffer * nrows, + formats='(2,)a4,(2,2)a4,(2,)i4,(2,2)i4,i2,f8,f4,u2,a1', + names=['var0', 'var1', 'var1_', 'var2', 'var3', 'var4', 'var5', + 'var6', 'var7']) # *-* + + # Assign the value exceptions + r["var2"][3] = ((2, 2), (2, 2)) # *-* + r["var2"][4] = ((2, 2), (2, 2)) # *-* + + # Read the table in another recarray + r2 = table.read() + + # This generates too much output. Activate only when + # self.nrowsinbuf is very small (<10) + if common.verbose and 1: + print("Table values:") + print(r2) + print("Record values:") + print(r) + + # Both checks do work, however, tobytes() seems more stringent. + self.assertEqual(r.tobytes(), r2.tobytes()) + # self.assertTrue(common.areArraysEqual(r,r2)) + + +class RecordT(tb.IsDescription): + var0 = tb.IntCol(dflt=1, shape=()) # native int + var1 = tb.IntCol(dflt=[1], shape=(1,)) # 1-D int (one element) + var2_s = tb.IntCol(dflt=[1, 1], shape=2) # 1-D int (two elements) + var2 = tb.IntCol(dflt=[1, 1], shape=(2,)) # 1-D int (two elements) + var3 = tb.IntCol(dflt=[[0, 0], [1, 1]], shape=(2, 2)) # 2-D int + + +class ShapeTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + self.populateFile() + + def populateFile(self): + table = self.h5file.create_table(self.h5file.root, 'table', RecordT) + row = table.row + # Fill the table with some rows with default values + for i in range(1): + row.append() + + # Flush the buffer for this table + table.flush() + + def test00(self): + """Checking scalar shapes""" + + if self.reopen: + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("The values look like:", table.cols.var0[:]) + print("They should look like:", [1]) + + # The real check + self.assertEqual(table.cols.var0[:].tolist(), [1]) + + def test01(self): + """Checking undimensional (one element) shapes""" + + if self.reopen: + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("The values look like:", table.cols.var1[:]) + print("They should look like:", [[1]]) + + # The real check + self.assertEqual(table.cols.var1[:].tolist(), [[1]]) + + def test02(self): + """Checking undimensional (two elements) shapes""" + + if self.reopen: + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("The values look like:", table.cols.var2[:]) + print("They should look like:", [[1, 1]]) + + # The real check + self.assertEqual(table.cols.var2[:].tolist(), [[1, 1]]) + self.assertEqual(table.cols.var2_s[:].tolist(), [[1, 1]]) + + def test03(self): + """Checking bidimensional shapes""" + + if self.reopen: + self._reopen() + table = self.h5file.root.table + + if common.verbose: + print("The values look like:", table.cols.var3[:]) + print("They should look like:", [[[0, 0], [1, 1]]]) + + # The real check + self.assertEqual(table.cols.var3[:].tolist(), [[[0, 0], [1, 1]]]) + + +class ShapeTestCase1(ShapeTestCase): + reopen = 0 + + +class ShapeTestCase2(ShapeTestCase): + reopen = 1 + + +class SetItemTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + # Create a new table: + self.table = self.h5file.create_table(self.h5file.root, + 'recarray', Rec) + self.table.nrowsinbuf = self.buffersize # set buffer value + + def test01(self): + """Checking modifying one table row with 
__setitem__""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing row + table[2] = (456, 'db2', 1.2) + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3), + (456, 'db2', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test01b(self): + """Checking modifying one table row with __setitem__ (long index)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing row + table[2] = (456, 'db2', 1.2) + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3), + (456, 'db2', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test02(self): + """Modifying one row, with a step (__setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array([(457, 'db1', 1.2)], formats=formats) + table[1:3:2] = rows + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (457, 'db1', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test03(self): + """Checking modifying several rows at once (__setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array( + [(457, 'db1', 1.2), (5, 'de1', 1.3)], formats=formats) + + # table.modify_rows(start=1, rows=rows) + table[1:3] = rows + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (5, 'de1', 1.3), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = 
self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test04(self): + """Modifying several rows at once, with a step (__setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify two existing rows + rows = np.rec.array([(457, 'db1', 1.2), (6, 'de2', 1.3)], + formats=formats) + # table[1:4:2] = rows + table[1::2] = rows + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (457, 'db1', 1.2), (6, 'de2', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test05(self): + """Checking modifying one column (single element, __setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing column + table.cols.col1[1] = -1 + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (-1, 'ded', 1.3), + (457, 'db1', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06a(self): + """Checking modifying one column (several elements, __setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing column + table.cols.col1[1:4] = [(2, 2), (3, 3), (4, 4)] + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3), + (3, 'db1', 1.2), (4, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06b(self): + """Checking modifying one column (iterator, __setitem__)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing column + with self.assertRaises(NotImplementedError): + for row 
in table.iterrows():
+ row['col1'] = row.nrow + 1
+ row.append()
+ table.flush()
+
+ def test07(self):
+ """Modifying one column (several elements, __setitem__, step)"""
+
+ table = self.table
+ formats = table.description._v_nested_formats
+
+ # append new rows
+ r = np.rec.array([(456, 'dbe', 1.2), (1, 'ded', 1.3)], formats=formats)
+ table.append(r)
+ table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)])
+
+ # Modify just one existing column
+ table.cols.col1[1:4:2] = [(2, 2), (3, 3)]
+
+ # Create the modified recarray
+ r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3),
+ (457, 'db1', 1.2), (3, 'de1', 1.3)],
+ formats=formats,
+ names="col1,col2,col3")
+
+ # Read the modified table
+ if self.reopen:
+ self._reopen()
+ table = self.h5file.root.recarray
+ table.nrowsinbuf = self.buffersize # set buffer value
+ r2 = table.read()
+ if common.verbose:
+ print("Original table-->", repr(r2))
+ print("Should look like-->", repr(r1))
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+ self.assertEqual(table.nrows, 4)
+
+ def test08(self):
+ """Modifying one column (one element, __setitem__, step)"""
+
+ table = self.table
+ formats = table.description._v_nested_formats
+
+ # append new rows
+ r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats)
+ table.append(r)
+ table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)])
+
+ # Modify just one existing column
+ table.cols.col1[1:4:3] = [(2, 2)]
+
+ # Create the modified recarray
+ r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3),
+ (457, 'db1', 1.2), (5, 'de1', 1.3)],
+ formats=formats,
+ names="col1,col2,col3")
+
+ # Read the modified table
+ if self.reopen:
+ self._reopen()
+ table = self.h5file.root.recarray
+ table.nrowsinbuf = self.buffersize # set buffer value
+ r2 = table.read()
+ if common.verbose:
+ print("Original table-->", repr(r2))
+ print("Should look like-->", repr(r1))
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+ self.assertEqual(table.nrows, 4)
+
+ def test09(self):
+ """Modifying beyond the table extent (__setitem__, step)"""
+
+ table = self.table
+ formats = table.description._v_nested_formats
+
+ # append new rows
+ r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats)
+ table.append(r)
+ table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)])
+
+ # Try to modify beyond the extent
+ # This will silently exclude the non-fitting rows
+ rows = np.rec.array([(457, 'db1', 1.2), (6, 'de2', 1.3)],
+ formats=formats)
+ table[1::2] = rows
+
+ # What it should look like
+ r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2),
+ (457, 'db1', 1.2), (6, 'de2', 1.3)],
+ formats=formats)
+
+ # Read the modified table
+ if self.reopen:
+ self._reopen()
+ table = self.h5file.root.recarray
+ table.nrowsinbuf = self.buffersize # set buffer value
+ r2 = table.read()
+ if common.verbose:
+ print("Original table-->", repr(r2))
+ print("Should look like-->", repr(r1))
+ self.assertEqual(r1.tobytes(), r2.tobytes())
+ self.assertEqual(table.nrows, 4)
+
+
+class SetItemTestCase1(SetItemTestCase):
+ reopen = 0
+ buffersize = 1
+
+
+class SetItemTestCase2(SetItemTestCase):
+ reopen = 1
+ buffersize = 2
+
+
+class SetItemTestCase3(SetItemTestCase):
+ reopen = 0
+ buffersize = 1000
+
+
+class SetItemTestCase4(SetItemTestCase):
+ reopen = 1
+ buffersize = 1000
+
+
+class UpdateRowTestCase(common.TempFileMixin, common.PyTablesTestCase):
+
+ def setUp(self):
+ super().setUp()
+
+ # Create a new table:
+ self.table = self.h5file.create_table(self.h5file.root,
+ 'recarray', Rec)
+ self.table.nrowsinbuf = self.buffersize # set
buffer value + + def test01(self): + """Checking modifying one table row with Row.update""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], + formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing row + for row in table.iterrows(2, 3): + (row['col1'], row['col2'], row['col3']) = [456, 'db2', 1.2] + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3), + (456, 'db2', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test02(self): + """Modifying one row, with a step (Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify two existing rows + for row in table.iterrows(1, 3, 2): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 3: + (row['col1'], row['col2'], row['col3']) = (6, 'de2', 1.3) + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (457, 'db1', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test03(self): + """Checking modifying several rows at once (Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify two existing rows + for row in table.iterrows(1, 3): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 2: + (row['col1'], row['col2'], row['col3']) = (5, 'de1', 1.3) + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (5, 'de1', 1.3), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test04(self): + """Modifying several rows at once, with a step (Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify 
two existing rows + for row in table.iterrows(1, stop=4, step=2): + if row.nrow == 1: + (row['col1'], row['col2'], row['col3']) = (457, 'db1', 1.2) + elif row.nrow == 3: + (row['col1'], row['col2'], row['col3']) = (6, 'de2', 1.3) + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (457, 'db1', 1.2), + (457, 'db1', 1.2), (6, 'de2', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test05(self): + """Checking modifying one column (single element, Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing column + for row in table.iterrows(1, 2): + row['col1'] = -1 + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (-1, 'ded', 1.3), + (457, 'db1', 1.2), (5, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test06(self): + """Checking modifying one column (several elements, Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just one existing column + for row in table.iterrows(1, 4): + row['col1'] = row.nrow + 1 + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ded', 1.3), + (3, 'db1', 1.2), (4, 'de1', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test07(self): + """Modifying values from a selection""" + + table = self.table + formats = table.description._v_nested_formats + + # append new rows + r = np.rec.array([(456, 'dbe', 1.2), (1, 'ded', 1.3)], formats=formats) + table.append(r) + table.append([(457, 'db1', 1.2), (5, 'de1', 1.3)]) + + # Modify just rows with col1 < 456 + for row in table.iterrows(): + if row['col1'][0] < 456: + row['col1'] = 2 + row['col2'] = 'ada' + row.update() + + # Create the modified recarray + r1 = np.rec.array([(456, 'dbe', 1.2), (2, 'ada', 1.3), + (457, 'db1', 1.2), (2, 'ada', 1.3)], + formats=formats, + names="col1,col2,col3") + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if 
common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, 4) + + def test08(self): + """Modifying a large table (Row.update)""" + + table = self.table + formats = table.description._v_nested_formats + + nrows = 100 + + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify all the rows + for row in table.iterrows(): + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats=formats, names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test08b(self): + """Setting values on a large table without calling Row.update""" + + table = self.table + formats = table.description._v_nested_formats + + nrows = 100 + + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify all the rows (actually don't) + for row in table.iterrows(): + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + # row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats=formats, names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test09(self): + """Modifying selected values on a large table""" + + table = self.table + formats = table.description._v_nested_formats + + nrows = 100 + + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify selected rows + for row in table.iterrows(): + if row['col1'][0] > nrows-3: + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats=formats, names="col1,col2,col3") + for i in range(nrows): + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + + # modify just the last line + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + def test09b(self): + 
"""Modifying selected values on a large table (alternate values)""" + + table = self.table + formats = table.description._v_nested_formats + + nrows = 100 + + # append new rows + row = table.row + for i in range(nrows): + row['col1'] = i-1 + row['col2'] = 'a'+str(i-1) + row['col3'] = -1.0 + row.append() + table.flush() + + # Modify selected rows + for row in table.iterrows(step=10): + row['col1'] = row.nrow + row['col2'] = 'b'+str(row.nrow) + row['col3'] = 0.0 + row.update() + + # Create the modified recarray + r1 = np.rec.array( + None, shape=nrows, formats=formats, names="col1,col2,col3") + for i in range(nrows): + if i % 10 > 0: + r1['col1'][i] = i-1 + r1['col2'][i] = 'a'+str(i-1) + r1['col3'][i] = -1.0 + else: + r1['col1'][i] = i + r1['col2'][i] = 'b'+str(i) + r1['col3'][i] = 0.0 + + # Read the modified table + if self.reopen: + self._reopen() + table = self.h5file.root.recarray + table.nrowsinbuf = self.buffersize # set buffer value + r2 = table.read() + if common.verbose: + print("Original table-->", repr(r2)) + print("Should look like-->", repr(r1)) + self.assertEqual(r1.tobytes(), r2.tobytes()) + self.assertEqual(table.nrows, nrows) + + +class UpdateRowTestCase1(UpdateRowTestCase): + reopen = 0 + buffersize = 1 + + +class UpdateRowTestCase2(UpdateRowTestCase): + reopen = 1 + buffersize = 2 + + +class UpdateRowTestCase3(UpdateRowTestCase): + reopen = 0 + buffersize = 1000 + + +class UpdateRowTestCase4(UpdateRowTestCase): + reopen = 1 + buffersize = 1000 + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + # common.heavy = 1 # Uncomment this only for testing purposes + + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(DictWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecordDTWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(NumPyDTWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayOneWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayTwoWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayThreeWriteTestCase)) + theSuite.addTest( + common.unittest.makeSuite(RecArrayAlignedWriteTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompressZLIBTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompressTwoTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(IterRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(RecArrayRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(GetColRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(DefaultValues)) + theSuite.addTest(common.unittest.makeSuite(RecArrayIO)) + theSuite.addTest(common.unittest.makeSuite(ShapeTestCase1)) + theSuite.addTest(common.unittest.makeSuite(ShapeTestCase2)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase1)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase2)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase3)) + theSuite.addTest(common.unittest.makeSuite(SetItemTestCase4)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase1)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase2)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase3)) + theSuite.addTest(common.unittest.makeSuite(UpdateRowTestCase4)) + theSuite.addTest( + common.unittest.makeSuite(CompressBloscTablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompressLZOTablesTestCase)) + if common.heavy: + theSuite.addTest( + 
common.unittest.makeSuite(CompressBzip2TablesTestCase)) + theSuite.addTest(common.unittest.makeSuite(BigTablesTestCase)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_timestamps.py b/tables/tests/test_timestamps.py new file mode 100644 index 0000000..1d4528c --- /dev/null +++ b/tables/tests/test_timestamps.py @@ -0,0 +1,172 @@ +"""This test unit checks control of dataset timestamps with track_times. + +""" + +import hashlib +import sys +import time +from pathlib import Path + +import tables as tb +from tables.tests import common + +HEXDIGEST = '2aafb84ab739bb4ae61d2939dc010bfd' + + +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) # 4-character String + var2 = tb.IntCol() # integer + var3 = tb.Int16Col() # short integer + + +class TrackTimesMixin: + def _add_datasets(self, group, j, track_times): + # Create a table + table = self.h5file.create_table(group, f'table{j}', + Record, + title=self.title, + filters=None, + track_times=track_times) + # Get the record object associated with the new table + d = table.row + # Fill the table + for i in range(self.nrows): + d['var1'] = '%04d' % (self.nrows - i) + d['var2'] = i + d['var3'] = i * 2 + d.append() # This injects the Record values + # Flush the buffer for this table + table.flush() + + # Create a couple of arrays in each group + var1List = [x['var1'] for x in table.iterrows()] + var3List = [x['var3'] for x in table.iterrows()] + + self.h5file.create_array(group, f'array{j}', + var1List, f"col {j}", + track_times=track_times) + + # Create CArrays as well + self.h5file.create_carray(group, name=f'carray{j}', + obj=var3List, + title="col {}".format(j + 2), + track_times=track_times) + + # Create EArrays as well + ea = self.h5file.create_earray(group, f'earray{j}', + tb.StringAtom(itemsize=4), (0,), + "col {}".format(j + 4), + track_times=track_times) + # And fill them with some values + ea.append(var1List) + + # Finally VLArrays too + vla = self.h5file.create_vlarray(group, f'vlarray{j}', + tb.Int16Atom(), + "col {}".format(j + 6), + track_times=track_times) + # And fill them with some values + vla.append(var3List) + + +class TimestampTestCase(TrackTimesMixin, common.TempFileMixin, + common.PyTablesTestCase): + title = "A title" + nrows = 10 + + def setUp(self): + super().setUp() + self.populateFile() + + def populateFile(self): + group = self.h5file.root + for j in range(4): + track_times = bool(j % 2) + self._add_datasets(group, j, track_times) + + def test00_checkTimestamps(self): + """Checking retrieval of timestamps""" + + for pattern in ('/table{}', + '/array{}', + '/carray{}', + '/earray{}', + '/vlarray{}'): + # Verify that: + # - if track_times was False, ctime is 0 + # - if track_times was True, ctime is not 0, + # and has either stayed the same or incremented + tracked_ctimes = [] + for j in range(4): + track_times = bool(j % 2) + node = pattern.format(j) + obj = self.h5file.get_node(node) + # Test property retrieval + self.assertEqual(obj.track_times, track_times) + timestamps = obj._get_obj_timestamps() + self.assertEqual(timestamps.atime, 0) + self.assertEqual(timestamps.mtime, 0) + self.assertEqual(timestamps.btime, 0) + if not track_times: + self.assertEqual(timestamps.ctime, 0) + else: + self.assertNotEqual(timestamps.ctime, 0) + tracked_ctimes.append(timestamps.ctime) + self.assertGreaterEqual(tracked_ctimes[1], tracked_ctimes[0]) + + +class BitForBitTestCase(TrackTimesMixin, 
common.TempFileMixin,
+                        common.PyTablesTestCase):
+    title = "A title"
+    nrows = 10
+
+    def repopulateFile(self, track_times):
+        self.h5file.close()
+        self.h5file = tb.open_file(self.h5fname, mode="w")
+        group = self.h5file.root
+        self._add_datasets(group, 1, track_times)
+        self.h5file.close()
+
+    def test00_checkReproducibility(self):
+        """Checking bit-for-bit reproducibility with no track_times"""
+
+        self.repopulateFile(track_times=False)
+        hexdigest_wo_track_1 = self._get_digest(self.h5fname)
+        self.repopulateFile(track_times=True)
+        hexdigest_w_track_1 = self._get_digest(self.h5fname)
+        time.sleep(1)
+        self.repopulateFile(track_times=True)
+        hexdigest_w_track_2 = self._get_digest(self.h5fname)
+        self.repopulateFile(track_times=False)
+        hexdigest_wo_track_2 = self._get_digest(self.h5fname)
+        self.assertEqual(HEXDIGEST, hexdigest_wo_track_1)
+        self.assertEqual(hexdigest_wo_track_1, hexdigest_wo_track_2)
+        self.assertNotEqual(hexdigest_wo_track_1, hexdigest_w_track_1)
+        self.assertNotEqual(hexdigest_w_track_1, hexdigest_w_track_2)
+
+    def _get_digest(self, filename):
+        md5 = hashlib.md5()
+        # Hash the whole file contents at once (iterating bytes yields ints).
+        md5.update(Path(filename).read_bytes())
+
+        hexdigest = md5.hexdigest()
+
+        return hexdigest
+
+
+def suite():
+    theSuite = common.unittest.TestSuite()
+    niter = 1
+    # common.heavy = 1  # Uncomment this only for testing purposes!
+
+    for i in range(niter):
+        theSuite.addTest(common.unittest.makeSuite(TimestampTestCase))
+        theSuite.addTest(common.unittest.makeSuite(BitForBitTestCase))
+
+    return theSuite
+
+
+if __name__ == '__main__':
+    common.parse_argv(sys.argv)
+    common.print_versions()
+    common.unittest.main(defaultTest='suite')
diff --git a/tables/tests/test_timetype.py b/tables/tests/test_timetype.py
new file mode 100644
index 0000000..9eae207
--- /dev/null
+++ b/tables/tests/test_timetype.py
@@ -0,0 +1,514 @@
+"""Unit test for the Time datatypes."""
+
+
+import numpy as np
+
+import tables as tb
+from tables.tests import common
+
+
+class LeafCreationTestCase(common.TempFileMixin, common.PyTablesTestCase):
+    """Tests creating Tables, VLArrays, and EArrays with Time data."""
+
+    def test00_UnidimLeaves(self):
+        """Creating new nodes with unidimensional time elements."""
+
+        # Table creation.
+        class MyTimeRow(tb.IsDescription):
+            intcol = tb.IntCol()
+            t32col = tb.Time32Col()
+            t64col = tb.Time64Col()
+
+        self.h5file.create_table('/', 'table', MyTimeRow)
+
+        # VLArray creation.
+        self.h5file.create_vlarray('/', 'vlarray4', tb.Time32Atom())
+        self.h5file.create_vlarray('/', 'vlarray8', tb.Time64Atom())
+
+        # EArray creation.
+        self.h5file.create_earray('/', 'earray4', tb.Time32Atom(), shape=(0,))
+        self.h5file.create_earray('/', 'earray8', tb.Time64Atom(), shape=(0,))
+
+    def test01_MultidimLeaves(self):
+        """Creating new nodes with multidimensional time elements."""
+
+        # Table creation.
+        class MyTimeRow(tb.IsDescription):
+            intcol = tb.IntCol(shape=(2, 1))
+            t32col = tb.Time32Col(shape=(2, 1))
+            t64col = tb.Time64Col(shape=(2, 1))
+        self.h5file.create_table('/', 'table', MyTimeRow)
+
+        # VLArray creation.
+        self.h5file.create_vlarray(
+            '/', 'vlarray4', tb.Time32Atom(shape=(2, 1)))
+        self.h5file.create_vlarray(
+            '/', 'vlarray8', tb.Time64Atom(shape=(2, 1)))
+
+        # EArray creation. 
+ self.h5file.create_earray( + '/', 'earray4', tb.Time32Atom(), shape=(0, 2, 1)) + self.h5file.create_earray( + '/', 'earray8', tb.Time64Atom(), shape=(0, 2, 1)) + + +class OpenTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests opening a file with Time nodes.""" + + # The description used in the test Table. + class MyTimeRow(tb.IsDescription): + t32col = tb.Time32Col(shape=(2, 1)) + t64col = tb.Time64Col(shape=(2, 1)) + + # The atoms used in the test VLArrays. + myTime32Atom = tb.Time32Atom(shape=(2, 1)) + myTime64Atom = tb.Time64Atom(shape=(2, 1)) + + def setUp(self): + super().setUp() + + # Create test Table. + self.h5file.create_table('/', 'table', self.MyTimeRow) + + # Create test VLArrays. + self.h5file.create_vlarray('/', 'vlarray4', self.myTime32Atom) + self.h5file.create_vlarray('/', 'vlarray8', self.myTime64Atom) + + self._reopen() + + def test00_OpenFile(self): + """Opening a file with Time nodes.""" + + # Test the Table node. + tbl = self.h5file.root.table + self.assertEqual( + tbl.coldtypes['t32col'], + self.MyTimeRow.columns['t32col'].dtype, + "Column dtypes do not match.") + self.assertEqual( + tbl.coldtypes['t64col'], + self.MyTimeRow.columns['t64col'].dtype, + "Column dtypes do not match.") + + # Test the VLArray nodes. + vla4 = self.h5file.root.vlarray4 + self.assertEqual( + vla4.atom.dtype, self.myTime32Atom.dtype, + "Atom types do not match.") + self.assertEqual( + vla4.atom.shape, self.myTime32Atom.shape, + "Atom shapes do not match.") + + vla8 = self.h5file.root.vlarray8 + self.assertEqual( + vla8.atom.dtype, self.myTime64Atom.dtype, + "Atom types do not match.") + self.assertEqual( + vla8.atom.shape, self.myTime64Atom.shape, + "Atom shapes do not match.") + + def test01_OpenFileStype(self): + """Opening a file with Time nodes, comparing Atom.stype.""" + + # Test the Table node. + tbl = self.h5file.root.table + self.assertEqual( + tbl.coltypes['t32col'], + self.MyTimeRow.columns['t32col'].type, + "Column types do not match.") + self.assertEqual( + tbl.coltypes['t64col'], + self.MyTimeRow.columns['t64col'].type, + "Column types do not match.") + + # Test the VLArray nodes. + vla4 = self.h5file.root.vlarray4 + self.assertEqual( + vla4.atom.type, self.myTime32Atom.type, + "Atom types do not match.") + + vla8 = self.h5file.root.vlarray8 + self.assertEqual( + vla8.atom.type, self.myTime64Atom.type, + "Atom types do not match.") + + +class CompareTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests whether stored and retrieved time data is kept the same.""" + + # The description used in the test Table. + class MyTimeRow(tb.IsDescription): + t32col = tb.Time32Col(pos=0) + t64col = tb.Time64Col(shape=(2,), pos=1) + + # The atoms used in the test VLArrays. + myTime32Atom = tb.Time32Atom(shape=(2,)) + myTime64Atom = tb.Time64Atom(shape=(2,)) + + def test00_Compare32VLArray(self): + """Comparing written 32-bit time data with read data in a VLArray.""" + + wtime = np.array((1_234_567_890,) * 2, np.int32) + + # Create test VLArray with data. + vla = self.h5file.create_vlarray('/', 'test', self.myTime32Atom) + vla.append(wtime) + self._reopen() + + # Check the written data. + rtime = self.h5file.root.test.read()[0][0] + self.h5file.close() + self.assertTrue(common.allequal(rtime, wtime), + "Stored and retrieved values do not match.") + + def test01_Compare64VLArray(self): + """Comparing written 64-bit time data with read data in a VLArray.""" + + wtime = np.array((1_234_567_890.123456,) * 2, np.float64) + + # Create test VLArray with data. 
+ vla = self.h5file.create_vlarray('/', 'test', self.myTime64Atom) + vla.append(wtime) + self._reopen() + + # Check the written data. + rtime = self.h5file.root.test.read()[0][0] + self.h5file.close() + self.assertTrue(common.allequal(rtime, wtime), + "Stored and retrieved values do not match.") + + def test01b_Compare64VLArray(self): + """Comparing several written and read 64-bit time values in a + VLArray.""" + + # Create test VLArray with data. + vla = self.h5file.create_vlarray('/', 'test', self.myTime64Atom) + + # Size of the test. + nrows = vla.nrowsinbuf + 34 # Add some more rows than buffer. + # Only for home checks; the value above should check better + # the I/O with multiple buffers. + # nrows = 10 + + for i in range(nrows): + j = i * 2 + vla.append((j + 0.012, j + 1 + 0.012)) + self._reopen() + + # Check the written data. + arr = self.h5file.root.test.read() + self.h5file.close() + + arr = np.array(arr) + orig_val = np.arange(0, nrows * 2, dtype=np.int32) + 0.012 + orig_val.shape = (nrows, 1, 2) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", arr) + self.assertTrue(common.allequal(arr, orig_val), + "Stored and retrieved values do not match.") + + def test02_CompareTable(self): + """Comparing written time data with read data in a Table.""" + + wtime = 1_234_567_890.123456 + + # Create test Table with data. + tbl = self.h5file.create_table('/', 'test', self.MyTimeRow) + row = tbl.row + row['t32col'] = int(wtime) + row['t64col'] = (wtime, wtime) + row.append() + self._reopen() + + # Check the written data. + recarr = self.h5file.root.test.read(0) + self.h5file.close() + + self.assertEqual(recarr['t32col'][0], int(wtime), + "Stored and retrieved values do not match.") + + comp = (recarr['t64col'][0] == np.array((wtime, wtime))) + self.assertTrue(np.alltrue(comp), + "Stored and retrieved values do not match.") + + def test02b_CompareTable(self): + """Comparing several written and read time values in a Table.""" + + # Create test Table with data. + tbl = self.h5file.create_table('/', 'test', self.MyTimeRow) + + # Size of the test. + nrows = tbl.nrowsinbuf + 34 # Add some more rows than buffer. + # Only for home checks; the value above should check better + # the I/O with multiple buffers. + # nrows = 10 + + row = tbl.row + for i in range(nrows): + row['t32col'] = i + j = i * 2 + row['t64col'] = (j + 0.012, j+1+0.012) + row.append() + + self._reopen() + + # Check the written data. + recarr = self.h5file.root.test.read() + self.h5file.close() + + # Time32 column. + orig_val = np.arange(nrows, dtype=np.int32) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", recarr['t32col'][:]) + self.assertTrue(np.alltrue(recarr['t32col'][:] == orig_val), + "Stored and retrieved values do not match.") + + # Time64 column. + orig_val = np.arange(0, nrows * 2, dtype=np.int32) + 0.012 + orig_val.shape = (nrows, 2) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", recarr['t64col'][:]) + self.assertTrue( + common.allequal(recarr['t64col'][:], orig_val, np.float64), + "Stored and retrieved values do not match.") + + def test03_Compare64EArray(self): + """Comparing written 64-bit time data with read data in an EArray.""" + + wtime = 1_234_567_890.123456 + + # Create test EArray with data. + ea = self.h5file.create_earray( + '/', 'test', tb.Time64Atom(), shape=(0,)) + ea.append((wtime,)) + self._reopen() + + # Check the written data. 
+ rtime = self.h5file.root.test[0] + self.h5file.close() + self.assertTrue(common.allequal(rtime, wtime), + "Stored and retrieved values do not match.") + + def test03b_Compare64EArray(self): + """Comparing several written and read 64-bit time values in an + EArray.""" + + # Create test EArray with data. + ea = self.h5file.create_earray('/', 'test', tb.Time64Atom(), + shape=(0, 2)) + + # Size of the test. + nrows = ea.nrowsinbuf + 34 # Add some more rows than buffer. + # Only for home checks; the value above should check better + # the I/O with multiple buffers. + # nrows = 10 + + for i in range(nrows): + j = i * 2 + ea.append(((j + 0.012, j + 1 + 0.012),)) + self._reopen() + + # Check the written data. + arr = self.h5file.root.test.read() + self.h5file.close() + + orig_val = np.arange(0, nrows * 2, dtype=np.int32) + 0.012 + orig_val.shape = (nrows, 2) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", arr) + self.assertTrue(common.allequal(arr, orig_val), + "Stored and retrieved values do not match.") + + +class UnalignedTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Tests writing and reading unaligned time values in a table.""" + + # The description used in the test Table. + # Time fields are unaligned because of 'i8col'. + class MyTimeRow(tb.IsDescription): + i8col = tb.Int8Col(pos=0) + t32col = tb.Time32Col(pos=1) + t64col = tb.Time64Col(shape=(2,), pos=2) + + def test00_CompareTable(self): + """Comparing written unaligned time data with read data in a Table.""" + + # Create test Table with data. + tbl = self.h5file.create_table('/', 'test', self.MyTimeRow) + + # Size of the test. + nrows = tbl.nrowsinbuf + 34 # Add some more rows than buffer. + # Only for home checks; the value above should check better + # the I/O with multiple buffers. + # nrows = 10 + + row = tbl.row + for i in range(nrows): + row['i8col'] = i + row['t32col'] = i + j = i * 2 + row['t64col'] = (j + 0.012, j+1+0.012) + row.append() + + self._reopen() + + # Check the written data. + recarr = self.h5file.root.test.read() + self.h5file.close() + + # Int8 column. + orig_val = np.arange(nrows, dtype=np.int8) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", recarr['i8col'][:]) + self.assertTrue(np.alltrue(recarr['i8col'][:] == orig_val), + "Stored and retrieved values do not match.") + + # Time32 column. + orig_val = np.arange(nrows, dtype=np.int32) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", recarr['t32col'][:]) + self.assertTrue(np.alltrue(recarr['t32col'][:] == orig_val), + "Stored and retrieved values do not match.") + + # Time64 column. + orig_val = np.arange(0, nrows * 2, dtype=np.int32) + 0.012 + orig_val.shape = (nrows, 2) + if common.verbose: + print("Original values:", orig_val) + print("Retrieved values:", recarr['t64col'][:]) + self.assertTrue(common.allequal( + recarr['t64col'][:], orig_val, np.float64), + "Stored and retrieved values do not match.") + + +class BigEndianTestCase(common.PyTablesTestCase): + """Tests for reading big-endian time values in arrays and nested tables.""" + + def setUp(self): + super().setUp() + filename = common.test_filename('times-nested-be.h5') + self.h5file = tb.open_file(filename, 'r') + + def tearDown(self): + self.h5file.close() + super().tearDown() + + def test00a_Read32Array(self): + """Checking Time32 type in arrays.""" + + # Check the written data. + earr = self.h5file.root.earr32[:] + + # Generate the expected Time32 array. 
+ start = 1_178_896_298 + nrows = 10 + orig_val = np.arange(start, start + nrows, dtype=np.int32) + + if common.verbose: + print("Retrieved values:", earr) + print("Should look like:", orig_val) + self.assertTrue(np.alltrue(earr == orig_val), + "Retrieved values do not match the expected values.") + + def test00b_Read64Array(self): + """Checking Time64 type in arrays.""" + + # Check the written data. + earr = self.h5file.root.earr64[:] + + # Generate the expected Time64 array. + start = 1_178_896_298.832258 + nrows = 10 + orig_val = np.arange(start, start + nrows, dtype=np.float64) + + if common.verbose: + print("Retrieved values:", earr) + print("Should look like:", orig_val) + self.assertTrue(np.allclose(earr, orig_val, rtol=1.e-15), + "Retrieved values do not match the expected values.") + + def test01a_ReadPlainColumn(self): + """Checking Time32 type in plain columns.""" + + # Check the written data. + tbl = self.h5file.root.tbl + t32 = tbl.cols.t32[:] + + # Generate the expected Time32 array. + start = 1_178_896_298 + nrows = 10 + orig_val = np.arange(start, start + nrows, dtype=np.int32) + + if common.verbose: + print("Retrieved values:", t32) + print("Should look like:", orig_val) + self.assertTrue(np.alltrue(t32 == orig_val), + "Retrieved values do not match the expected values.") + + def test01b_ReadNestedColumn(self): + """Checking Time64 type in nested columns.""" + + # Check the written data. + tbl = self.h5file.root.tbl + t64 = tbl.cols.nested.t64[:] + + # Generate the expected Time64 array. + start = 1_178_896_298.832258 + nrows = 10 + orig_val = np.arange(start, start + nrows, dtype=np.float64) + + if common.verbose: + print("Retrieved values:", t64) + print("Should look like:", orig_val) + self.assertTrue(np.allclose(t64, orig_val, rtol=1.e-15), + "Retrieved values do not match the expected values.") + + def test02_ReadNestedColumnTwice(self): + """Checking Time64 type in nested columns (read twice).""" + + # Check the written data. + tbl = self.h5file.root.tbl + dummy = tbl.cols.nested.t64[:] + self.assertIsNotNone(dummy) + t64 = tbl.cols.nested.t64[:] + + # Generate the expected Time64 array. + start = 1_178_896_298.832258 + nrows = 10 + orig_val = np.arange(start, start + nrows, dtype=np.float64) + + if common.verbose: + print("Retrieved values:", t64) + print("Should look like:", orig_val) + self.assertTrue(np.allclose(t64, orig_val, rtol=1.e-15), + "Retrieved values do not match the expected values.") + + +def suite(): + """suite() -> test suite + + Returns a test suite consisting of all the test cases in the module. 
+ """ + + theSuite = common.unittest.TestSuite() + + theSuite.addTest(common.unittest.makeSuite(LeafCreationTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(CompareTestCase)) + theSuite.addTest(common.unittest.makeSuite(UnalignedTestCase)) + theSuite.addTest(common.unittest.makeSuite(BigEndianTestCase)) + + return theSuite + + +if __name__ == '__main__': + import sys + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_tree.py b/tables/tests/test_tree.py new file mode 100644 index 0000000..3cc8127 --- /dev/null +++ b/tables/tests/test_tree.py @@ -0,0 +1,1107 @@ +import sys +import tempfile +import warnings +from pathlib import Path +from time import perf_counter as clock + +import tables as tb +from tables.tests import common + + +# Test Record class +class Record(tb.IsDescription): + var1 = tb.StringCol(itemsize=4) # 4-character String + var2 = tb.IntCol() # integer + var3 = tb.Int16Col() # short integer + var4 = tb.FloatCol() # double (double-precision) + var5 = tb.Float32Col() # float (single-precision) + + +class TreeTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + title = "This is the table title" + expectedrows = 10 + appendrows = 5 + + def setUp(self): + super().setUp() + + # Create an instance of HDF5 Table + self.populateFile() + self.h5file.close() + + def populateFile(self): + group = self.h5file.root + maxshort = 1 << 15 + # maxint = 2147483647 # (2 ** 31 - 1) + for j in range(3): + # Create a table + table = self.h5file.create_table(group, 'table'+str(j), Record, + title=self.title, + filters=None, + expectedrows=self.expectedrows) + # Get the record object associated with the new table + d = table.row + # Fill the table + for i in range(self.expectedrows): + d['var1'] = '%04d' % (self.expectedrows - i) + d['var2'] = i + d['var3'] = i % maxshort + d['var4'] = float(i) + d['var5'] = float(i) + d.append() # This injects the Record values + # Flush the buffer for this table + table.flush() + + # Create a couple of arrays in each group + var1List = [x['var1'] for x in table.iterrows()] + var4List = [x['var4'] for x in table.iterrows()] + + self.h5file.create_array(group, 'var1', var1List, "1") + self.h5file.create_array(group, 'var4', var4List, "4") + + # Create a new group (descendant of group) + group2 = self.h5file.create_group(group, 'group'+str(j)) + # Iterate over this new group (group2) + group = group2 + + def test00_getNode(self): + """Checking the File.get_node() with string node names""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_getNode..." 
% self.__class__.__name__)
+
+        self.h5file = tb.open_file(self.h5fname, "r")
+        nodelist = ['/', '/table0', '/group0/var1', '/group0/group1/var4']
+        nodenames = []
+        for node in nodelist:
+            object = self.h5file.get_node(node)
+            nodenames.append(object._v_pathname)
+
+        self.assertEqual(nodenames, nodelist)
+        if common.verbose:
+            print("get_node(pathname) test passed")
+        nodegroups = [
+            '/', '/group0', '/group0/group1', '/group0/group1/group2']
+        nodenames = ['var1', 'var4']
+        nodepaths = []
+        for group in nodegroups:
+            for name in nodenames:
+                try:
+                    object = self.h5file.get_node(group, name)
+                except LookupError:
+                    pass
+                else:
+                    nodepaths.append(object._v_pathname)
+
+        self.assertEqual(nodepaths,
+                         ['/var1', '/var4',
+                          '/group0/var1', '/group0/var4',
+                          '/group0/group1/var1', '/group0/group1/var4'])
+
+        if common.verbose:
+            print("get_node(groupname, name) test passed")
+        nodelist = ['/', '/group0', '/group0/group1', '/group0/group1/group2',
+                    '/table0']
+        nodenames = []
+        groupobjects = []
+        # warnings.filterwarnings("error", category=UserWarning)
+        for node in nodelist:
+            try:
+                object = self.h5file.get_node(node, classname='Group')
+            except LookupError:
+                if common.verbose:
+                    (type, value, traceback) = sys.exc_info()
+                    print("\nGreat! The expected LookupError was caught!")
+                    print(value)
+            else:
+                nodenames.append(object._v_pathname)
+                groupobjects.append(object)
+
+        self.assertEqual(nodenames,
+                         ['/', '/group0', '/group0/group1',
+                          '/group0/group1/group2'])
+        if common.verbose:
+            print("get_node(groupname, classname='Group') test passed")
+
+        # Reset the warning
+        # warnings.filterwarnings("default", category=UserWarning)
+
+        nodenames = ['var1', 'var4']
+        nodearrays = []
+        for group in groupobjects:
+            for name in nodenames:
+                try:
+                    object = self.h5file.get_node(group, name, 'Array')
+                except Exception:
+                    pass
+                else:
+                    nodearrays.append(object._v_pathname)
+
+        self.assertEqual(nodearrays,
+                         ['/var1', '/var4',
+                          '/group0/var1', '/group0/var4',
+                          '/group0/group1/var1', '/group0/group1/var4'])
+        if common.verbose:
+            print("get_node(groupobject, name, classname='Array') test passed")
+
+    def test01_getNodeClass(self):
+        """Checking the File.get_node() with instances"""
+
+        if common.verbose:
+            print('\n', '-=' * 30)
+            print("Running %s.test01_getNodeClass..." %
+                  self.__class__.__name__)
+
+        self.h5file = tb.open_file(self.h5fname, "r")
+
+        # These three ways of calling get_node should return a Table instance
+        table = self.h5file.get_node("/group0/table1")
+        self.assertIsInstance(table, tb.Table)
+        table = self.h5file.get_node("/group0", "table1")
+        self.assertIsInstance(table, tb.Table)
+        table = self.h5file.get_node(self.h5file.root.group0, "table1")
+        self.assertIsInstance(table, tb.Table)
+
+        # This should return an Array instance
+        arr = self.h5file.get_node("/group0/var1")
+        self.assertIsInstance(arr, tb.Array)
+        self.assertIsInstance(arr, tb.Leaf)
+
+        # And this should return a Group
+        group = self.h5file.get_node("/group0", "group1", "Group")
+        self.assertIsInstance(group, tb.Group)
+
+    def test02_listNodes(self):
+        """Checking the File.list_nodes() method"""
+
+        if common.verbose:
+            print('\n', '-=' * 30)
+            print("Running %s.test02_listNodes..." 
% self.__class__.__name__) + + # Made the warnings to raise an error + # warnings.filterwarnings("error", category=UserWarning) + self.h5file = tb.open_file(self.h5fname, "r") + + self.assertRaises(TypeError, + self.h5file.list_nodes, '/', 'NoSuchClass') + + nodelist = ['/', '/group0', '/group0/table1', '/group0/group1/group2', + '/var1'] + nodenames = [] + objects = [] + for node in nodelist: + try: + objectlist = self.h5file.list_nodes(node) + except Exception: + pass + else: + objects.extend(objectlist) + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0', '/table0', '/var1', '/var4', + '/group0/group1', '/group0/table1', + '/group0/var1', '/group0/var4']) + if common.verbose: + print("list_nodes(pathname) test passed") + + nodenames = [] + for node in objects: + try: + objectlist = self.h5file.list_nodes(node) + except Exception: + pass + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/group1', '/group0/table1', + '/group0/var1', '/group0/var4', + '/group0/group1/group2', '/group0/group1/table2', + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("list_nodes(groupobject) test passed") + + nodenames = [] + for node in objects: + try: + objectlist = self.h5file.list_nodes(node, 'Leaf') + except TypeError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next TypeError was catched!") + print(value) + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/table1', + '/group0/var1', '/group0/var4', + '/group0/group1/table2', + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("list_nodes(groupobject, classname = 'Leaf') test passed") + + nodenames = [] + for node in objects: + try: + objectlist = self.h5file.list_nodes(node, 'Table') + except TypeError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next TypeError was catched!") + print(value) + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/table1', '/group0/group1/table2']) + + if common.verbose: + print("list_nodes(groupobject, classname = 'Table') test passed") + + # Reset the warning + # warnings.filterwarnings("default", category=UserWarning) + + def test02b_iterNodes(self): + """Checking the File.iter_nodes() method""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_iterNodes..." 
% self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + + self.assertRaises(TypeError, + self.h5file.list_nodes, '/', 'NoSuchClass') + + nodelist = ['/', '/group0', '/group0/table1', '/group0/group1/group2', + '/var1'] + nodenames = [] + objects = [] + for node in nodelist: + try: + objectlist = [o for o in self.h5file.iter_nodes(node)] + except Exception: + pass + else: + objects.extend(objectlist) + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0', '/table0', '/var1', '/var4', + '/group0/group1', '/group0/table1', + '/group0/var1', '/group0/var4']) + if common.verbose: + print("iter_nodes(pathname) test passed") + + nodenames = [] + for node in objects: + try: + objectlist = [o for o in self.h5file.iter_nodes(node)] + except Exception: + pass + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/group1', '/group0/table1', + '/group0/var1', '/group0/var4', + '/group0/group1/group2', '/group0/group1/table2', + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("iter_nodes(groupobject) test passed") + + nodenames = [] + for node in objects: + try: + objectlist = [o for o in self.h5file.iter_nodes(node, 'Leaf')] + except TypeError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next TypeError was catched!") + print(value) + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/table1', + '/group0/var1', '/group0/var4', + '/group0/group1/table2', + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("iter_nodes(groupobject, classname = 'Leaf') test passed") + + nodenames = [] + for node in objects: + try: + objectlist = [o for o in self.h5file.iter_nodes(node, 'Table')] + except TypeError: + if common.verbose: + (type, value, traceback) = sys.exc_info() + print("\nGreat!, the next TypeError was catched!") + print(value) + else: + for object in objectlist: + nodenames.append(object._v_pathname) + + self.assertEqual(nodenames, + ['/group0/table1', '/group0/group1/table2']) + + if common.verbose: + print("iter_nodes(groupobject, classname = 'Table') test passed") + + # Reset the warning + # warnings.filterwarnings("default", category=UserWarning) + + def test03_TraverseTree(self): + """Checking the File.walk_groups() method""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_TraverseTree..." 
% + self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + groups = [] + tables_ = [] + arrays = [] + for group in self.h5file.walk_groups(): + groups.append(group._v_pathname) + for table in self.h5file.list_nodes(group, 'Table'): + tables_.append(table._v_pathname) + for arr in self.h5file.list_nodes(group, 'Array'): + arrays.append(arr._v_pathname) + + self.assertEqual(groups, + ["/", "/group0", "/group0/group1", + "/group0/group1/group2"]) + + self.assertEqual( + tables_, + ["/table0", "/group0/table1", "/group0/group1/table2"]) + + self.assertEqual(arrays, + ['/var1', '/var4', + '/group0/var1', '/group0/var4', + '/group0/group1/var1', '/group0/group1/var4']) + if common.verbose: + print("walk_groups() test passed") + + groups = [] + tables_ = [] + arrays = [] + for group in self.h5file.walk_groups("/group0/group1"): + groups.append(group._v_pathname) + for table in self.h5file.list_nodes(group, 'Table'): + tables_.append(table._v_pathname) + for arr in self.h5file.list_nodes(group, 'Array'): + arrays.append(arr._v_pathname) + + self.assertEqual(groups, + ["/group0/group1", "/group0/group1/group2"]) + + self.assertEqual(tables_, ["/group0/group1/table2"]) + + self.assertEqual(arrays, [ + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("walk_groups(pathname) test passed") + + def test04_walkNodes(self): + """Checking File.walk_nodes""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_walkNodes..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + + self.assertRaises(TypeError, next, + self.h5file.walk_nodes('/', 'NoSuchClass')) + + groups = [] + tables1 = [] + tables2 = [] + arrays = [] + for group in self.h5file.walk_nodes(classname="Group"): + groups.append(group._v_pathname) + for table in group._f_iter_nodes(classname='Table'): + tables1.append(table._v_pathname) + + # Test the recursivity + for table in self.h5file.root._f_walknodes('Table'): + tables2.append(table._v_pathname) + + for arr in self.h5file.walk_nodes(classname='Array'): + arrays.append(arr._v_pathname) + + self.assertEqual(groups, + ["/", "/group0", "/group0/group1", + "/group0/group1/group2"]) + self.assertEqual(tables1, + ["/table0", "/group0/table1", + "/group0/group1/table2"]) + self.assertEqual(tables2, + ["/table0", "/group0/table1", + "/group0/group1/table2"]) + self.assertEqual(arrays, + ['/var1', '/var4', + '/group0/var1', '/group0/var4', + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("File.__iter__() and Group.__iter__ test passed") + + groups = [] + tables_ = [] + arrays = [] + for group in self.h5file.walk_nodes("/group0/group1", + classname="Group"): + groups.append(group._v_pathname) + for table in group._f_walknodes('Table'): + tables_.append(table._v_pathname) + for arr in self.h5file.walk_nodes(group, 'Array'): + arrays.append(arr._v_pathname) + + self.assertEqual(groups, + ["/group0/group1", "/group0/group1/group2"]) + + self.assertEqual(tables_, ["/group0/group1/table2"]) + + self.assertEqual(arrays, [ + '/group0/group1/var1', '/group0/group1/var4']) + + if common.verbose: + print("walk_nodes(pathname, classname) test passed") + + def test05_dir(self): + """Checking Group.__dir__""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_dir..." 
% self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + + """ + h5file nodes: + '/table0', '/var1', '/var4' + '/group0/table1', '/group0/var1', '/group0/var4', + '/group0/group1/table2', '/group0/group1/var1', '/group0/group1/var4' + """ + root_dir = dir(self.h5file.root) + + # Check some regular attributes. + + self.assertIn('_v_children', root_dir) + self.assertIn('_v_attrs', root_dir) + self.assertIn('_v_groups', root_dir) + self.assertIn('_g_get_child_group_class', root_dir) + self.assertIn('_g_get_child_group_class', root_dir) + self.assertIn('_f_close', root_dir) + + # Check children nodes. + + self.assertIn('group0', root_dir) + self.assertIn('table0', root_dir) + self.assertIn('var1', root_dir) + self.assertNotIn('table1', root_dir) + self.assertNotIn('table2', root_dir) + self.assertSequenceEqual(sorted(set(root_dir)), + sorted(root_dir)) # Check for no duplicates. + + root_group0_dir = dir(self.h5file.root.group0) + self.assertIn('group1', root_group0_dir) + self.assertIn('table1', root_group0_dir) + self.assertNotIn('table0', root_group0_dir) + self.assertNotIn('table2', root_group0_dir) + self.assertSequenceEqual(sorted(set(root_group0_dir)), + sorted(root_group0_dir)) + + root_group0_group1_dir = dir(self.h5file.root.group0.group1) + self.assertIn('group2', root_group0_group1_dir) + self.assertIn('table2', root_group0_group1_dir) + self.assertNotIn('table0', root_group0_group1_dir) + self.assertNotIn('table1', root_group0_group1_dir) + self.assertNotIn('group0', root_group0_group1_dir) + self.assertNotIn('group1', root_group0_group1_dir) + self.assertSequenceEqual(sorted(set(root_group0_group1_dir)), + sorted(root_group0_group1_dir)) + + if common.verbose: + print("Group.__dir__ test passed") + + def test06_v_groups(self): + """Checking Group._v_groups""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06_v_groups..." % self.__class__.__name__) + + self.h5file = tb.open_file(self.h5fname, "r") + + """ + h5file nodes: + '/table0', '/var1', '/var4' + '/group0/table1', '/group0/var1', '/group0/var4', + '/group0/group1/table2', '/group0/group1/var1', '/group0/group1/var4' + """ + self.assertIsInstance(self.h5file.root._v_groups, dict) + group_names = {'group0'} + names = {k for k, v in self.h5file.root._v_groups.iteritems()} + self.assertEqual(group_names, names) + groups = list(self.h5file.root._v_groups.itervalues()) + self.assertEqual(len(groups), len(group_names)) + + for group in groups: + with self.subTest(name=group._v_name): + self.assertIn(group._v_name, group_names) + + if common.verbose: + print("Group._v_groups test passed") + + +class DeepTreeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Checks for deep hierarchy levels in PyTables trees.""" + + def setUp(self): + super().setUp() + + # Here we put a more conservative limit to deal with more platforms + # With maxdepth = 64 this test would take less than 40 MB + # of main memory to run, which is quite reasonable nowadays. + # With maxdepth = 1024 this test will take around 300 MB. + if common.heavy: + self.maxdepth = 256 # Takes around 60 MB of memory! 
+ else: + self.maxdepth = 64 # This should be safe for most machines + if common.verbose: + print("Maximum depth tested :", self.maxdepth) + + # Open a new empty HDF5 file + group = self.h5file.root + if common.verbose: + print("Depth writing progress: ", end=' ') + + # Iterate until maxdepth + for depth in range(self.maxdepth): + # Save it on the HDF5 file + if common.verbose: + print("%3d," % (depth), end=' ') + # Create a couple of arrays here + self.h5file.create_array( + group, 'array', [1, 1], "depth: %d" % depth) + self.h5file.create_array( + group, 'array2', [1, 1], "depth: %d" % depth) + # And also a group + self.h5file.create_group(group, 'group2_' + str(depth)) + # Finally, iterate over a new group + group = self.h5file.create_group(group, 'group' + str(depth)) + + # Close the file + self.h5file.close() + + def _check_tree(self, filename): + # Open the previous HDF5 file in read-only mode + + with tb.open_file(filename, mode="r") as h5file: + group = h5file.root + if common.verbose: + print("\nDepth reading progress: ", end=' ') + + # Get the metadata on the previosly saved arrays + for depth in range(self.maxdepth): + if common.verbose: + print("%3d," % (depth), end=' ') + + # Check the contents + self.assertEqual(group.array[:], [1, 1]) + self.assertIn("array2", group) + self.assertIn("group2_"+str(depth), group) + + # Iterate over the next group + group = h5file.get_node(group, 'group' + str(depth)) + + if common.verbose: + print() # This flush the stdout buffer + + def test00_deepTree(self): + """Creation of a large depth object tree.""" + + self._check_tree(self.h5fname) + + def test01a_copyDeepTree(self): + """Copy of a large depth object tree.""" + + self.h5file = tb.open_file(self.h5fname, mode="r") + h5fname2 = tempfile.mktemp(".h5") + try: + with tb.open_file(h5fname2, mode="w") as h5file2: + if common.verbose: + print("\nCopying deep tree...") + + self.h5file.copy_node(self.h5file.root, h5file2.root, + recursive=True) + self.h5file.close() + + self._check_tree(h5fname2) + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + def test01b_copyDeepTree(self): + """Copy of a large depth object tree with small node cache.""" + + self.h5file = tb.open_file(self.h5fname, mode="r", node_cache_slots=10) + h5fname2 = tempfile.mktemp(".h5") + try: + with tb.open_file(h5fname2, mode="w", + node_cache_slots=10) as h5file2: + if common.verbose: + print("\nCopying deep tree...") + + self.h5file.copy_node(self.h5file.root, h5file2.root, + recursive=True) + self.h5file.close() + + self._check_tree(h5fname2) + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + def test01c_copyDeepTree(self): + """Copy of a large depth object tree with no node cache.""" + + self.h5file = tb.open_file(self.h5fname, mode="r", node_cache_slots=0) + h5fname2 = tempfile.mktemp(".h5") + try: + with tb.open_file(h5fname2, mode="w", + node_cache_slots=0) as h5file2: + if common.verbose: + print("\nCopying deep tree...") + + self.h5file.copy_node(self.h5file.root, h5file2.root, + recursive=True) + self.h5file.close() + + self._check_tree(h5fname2) + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + @common.unittest.skipUnless(common.heavy, 'only in heavy mode') + def test01d_copyDeepTree(self): + """Copy of a large depth object tree with static node cache.""" + + self.h5file = tb.open_file(self.h5fname, mode="r", + node_cache_slots=-256) + h5fname2 = tempfile.mktemp(".h5") + try: + with tb.open_file(h5fname2, mode="w", + node_cache_slots=-256) as h5file2: + 
if common.verbose: + print("\nCopying deep tree...") + + self.h5file.copy_node(self.h5file.root, h5file2.root, + recursive=True) + self.h5file.close() + + self._check_tree(h5fname2) + finally: + if Path(h5fname2).is_file(): + Path(h5fname2).unlink() + + +class WideTreeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Checks for maximum number of children for a Group.""" + + def test00_Leafs(self): + """Checking creation of large number of leafs (1024) per group. + + Variable 'maxchildren' controls this check. PyTables support up + to 4096 children per group, but this would take too much memory + (up to 64 MB) for testing purposes (may be we can add a test for + big platforms). A 1024 children run takes up to 30 MB. A 512 + children test takes around 25 MB. + + """ + + if common.heavy: + maxchildren = 4096 + else: + maxchildren = 256 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_wideTree..." % + self.__class__.__name__) + print("Maximum number of children tested :", maxchildren) + + a = [1, 1] + if common.verbose: + print("Children writing progress: ", end=' ') + for child in range(maxchildren): + if common.verbose: + print("%3d," % (child), end=' ') + self.h5file.create_array(self.h5file.root, 'array' + str(child), + a, "child: %d" % child) + if common.verbose: + print() + + t1 = clock() + a = [1, 1] + + # Open the previous HDF5 file in read-only mode + self._reopen() + if common.verbose: + print("\nTime spent opening a file with %d arrays: %s s" % + (maxchildren, clock()-t1)) + print("\nChildren reading progress: ", end=' ') + + # Get the metadata on the previosly saved arrays + for child in range(maxchildren): + if common.verbose: + print("%3d," % (child), end=' ') + + # Create an array for later comparison + # Get the actual array + array_ = getattr(self.h5file.root, 'array' + str(child)) + b = array_.read() + + # Arrays a and b must be equal + self.assertEqual(a, b) + if common.verbose: + print() # This flush the stdout buffer + + def test01_wideTree(self): + """Checking creation of large number of groups (1024) per group. + + Variable 'maxchildren' controls this check. PyTables support up + to 4096 children per group, but this would take too much memory + (up to 64 MB) for testing purposes (may be we can add a test for + big platforms). A 1024 children run takes up to 30 MB. A 512 + children test takes around 25 MB. + + """ + + if common.heavy: + # for big platforms! + maxchildren = 4096 + else: + # for standard platforms + maxchildren = 256 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_wideTree..." 
% + self.__class__.__name__) + print("Maximum number of children tested :", maxchildren) + + if common.verbose: + print("Children writing progress: ", end=' ') + for child in range(maxchildren): + if common.verbose: + print("%3d," % (child), end=' ') + self.h5file.create_group(self.h5file.root, 'group' + str(child), + "child: %d" % child) + if common.verbose: + print() + + t1 = clock() + + # Open the previous HDF5 file in read-only mode + self._reopen() + if common.verbose: + print("\nTime spent opening a file with %d groups: %s s" % + (maxchildren, clock()-t1)) + print("\nChildren reading progress: ", end=' ') + + # Get the metadata on the previosly saved arrays + for child in range(maxchildren): + if common.verbose: + print("%3d," % (child), end=' ') + # Get the actual group + group = getattr(self.h5file.root, 'group' + str(child)) + # Arrays a and b must be equal + self.assertEqual(group._v_title, "child: %d" % child) + + if common.verbose: + print() # This flush the stdout buffer + + +class HiddenTreeTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Check for hidden groups, leaves and hierarchies.""" + + def setUp(self): + super().setUp() + + self.visible = [] # list of visible object paths + self.hidden = [] # list of hidden object paths + + # Create some visible nodes: a, g, g/a1, g/a2, g/g, g/g/a. + h5f = self.h5file + h5f.create_array('/', 'a', [0]) + g = h5f.create_group('/', 'g') + h5f.create_array(g, 'a1', [0]) + h5f.create_array(g, 'a2', [0]) + g_g = h5f.create_group(g, 'g') + h5f.create_array(g_g, 'a', [0]) + + self.visible.extend(['/a', '/g', '/g/a1', '/g/a2', '/g/g', '/g/g/a']) + + # Create some hidden nodes: _p_a, _p_g, _p_g/a, _p_g/_p_a, g/_p_a. + h5f.create_array('/', '_p_a', [0]) + hg = h5f.create_group('/', '_p_g') + h5f.create_array(hg, 'a', [0]) + h5f.create_array(hg, '_p_a', [0]) + h5f.create_array(g, '_p_a', [0]) + + self.hidden.extend( + ['/_p_a', '/_p_g', '/_p_g/a', '/_p_g/_p_a', '/g/_p_a']) + + # The test behind commented out because the .objects dictionary + # has been removed (as well as .leaves and .groups) + def _test00_objects(self): + """Absence of hidden nodes in `File.objects`.""" + + objects = self.h5file.objects + + warnings.filterwarnings('ignore', category=DeprecationWarning) + + for vpath in self.visible: + self.assertTrue( + vpath in objects, + "Missing visible node ``%s`` from ``File.objects``." % vpath) + for hpath in self.hidden: + self.assertTrue( + hpath not in objects, + "Found hidden node ``%s`` in ``File.objects``." 
% hpath) + + warnings.filterwarnings('default', category=DeprecationWarning) + + # The test behind commented out because the .objects dictionary + # has been removed (as well as .leaves and .groups) + def _test00b_objects(self): + """Object dictionaries conformance with ``walk_nodes()``.""" + + def dictCheck(dictName, classname): + file_ = self.h5file + + objects = getattr(file_, dictName) + walkPaths = [node._v_pathname + for node in file_.walk_nodes('/', classname)] + dictPaths = [path for path in objects] + walkPaths.sort() + dictPaths.sort() + self.assertEqual( + walkPaths, dictPaths, + "nodes in ``%s`` do not match those from ``walk_nodes()``" + % dictName) + self.assertEqual( + len(walkPaths), len(objects), + "length of ``%s`` differs from that of ``walk_nodes()``" + % dictName) + + warnings.filterwarnings('ignore', category=DeprecationWarning) + + dictCheck('objects', None) + dictCheck('groups', 'Group') + dictCheck('leaves', 'Leaf') + + warnings.filterwarnings('default', category=DeprecationWarning) + + def test01_getNode(self): + """Node availability via `File.get_node()`.""" + + h5f = self.h5file + + for vpath in self.visible: + h5f.get_node(vpath) + for hpath in self.hidden: + h5f.get_node(hpath) + + def test02_walkGroups(self): + """Hidden group absence in `File.walk_groups()`.""" + + hidden = self.hidden + + for group in self.h5file.walk_groups('/'): + pathname = group._v_pathname + self.assertNotIn(pathname, hidden, + f"Walked across hidden group ``{pathname}``.") + + def test03_walkNodes(self): + """Hidden node absence in `File.walk_nodes()`.""" + + hidden = self.hidden + + for node in self.h5file.walk_nodes('/'): + pathname = node._v_pathname + self.assertNotIn(pathname, hidden, + f"Walked across hidden node ``{pathname}``.") + + def test04_listNodesVisible(self): + """Listing visible nodes under a visible group (list_nodes).""" + + hidden = self.hidden + + for node in self.h5file.list_nodes('/g'): + pathname = node._v_pathname + self.assertNotIn(pathname, hidden, + f"Listed hidden node ``{pathname}``.") + + def test04b_listNodesVisible(self): + """Listing visible nodes under a visible group (iter_nodes).""" + + hidden = self.hidden + + for node in self.h5file.iter_nodes('/g'): + pathname = node._v_pathname + self.assertNotIn(pathname, hidden, + f"Listed hidden node ``{pathname}``.") + + def test05_listNodesHidden(self): + """Listing visible nodes under a hidden group (list_nodes).""" + + hidden = self.hidden + + node_to_find = '/_p_g/a' + found_node = False + for node in self.h5file.list_nodes('/_p_g'): + pathname = node._v_pathname + if pathname == node_to_find: + found_node = True + self.assertIn(pathname, hidden, + f"Listed hidden node ``{pathname}``.") + + self.assertTrue(found_node, + "Hidden node ``%s`` was not listed." % node_to_find) + + def test05b_iterNodesHidden(self): + """Listing visible nodes under a hidden group (iter_nodes).""" + + hidden = self.hidden + + node_to_find = '/_p_g/a' + found_node = False + for node in self.h5file.iter_nodes('/_p_g'): + pathname = node._v_pathname + if pathname == node_to_find: + found_node = True + self.assertIn(pathname, hidden, + f"Listed hidden node ``{pathname}``.") + + self.assertTrue(found_node, + "Hidden node ``%s`` was not listed." 
% node_to_find) + + # The test behind commented out because the .objects dictionary + # has been removed (as well as .leaves and .groups) + def _test06_reopen(self): + """Reopening a file with hidden nodes.""" + + self.h5file.close() + self.h5file = tb.open_file(self.h5fname) + self.test00_objects() + + def test07_move(self): + """Moving a node between hidden and visible groups.""" + + is_visible_node = self.h5file.is_visible_node + + self.assertFalse(is_visible_node('/_p_g/a')) + self.h5file.move_node('/_p_g/a', '/g', 'a') + self.assertTrue(is_visible_node('/g/a')) + self.h5file.move_node('/g/a', '/_p_g', 'a') + self.assertFalse(is_visible_node('/_p_g/a')) + + def test08_remove(self): + """Removing a visible group with hidden children.""" + + self.assertIn('/g/_p_a', self.h5file) + self.h5file.root.g._f_remove(recursive=True) + self.assertNotIn('/g/_p_a', self.h5file) + + +class CreateParentsTestCase(common.TempFileMixin, common.PyTablesTestCase): + """Test the ``createparents`` flag. + + These are mainly for the user interface. More thorough tests on the + workings of the flag can be found in the ``test_do_undo.py`` module. + + """ + + filters = tb.Filters(complevel=4) # simply non-default + + def setUp(self): + super().setUp() + self.h5file.create_array('/', 'array', [1]) + self.h5file.create_group('/', 'group', filters=self.filters) + + def test00_parentType(self): + """Using the right type of parent node argument.""" + + h5file, root = self.h5file, self.h5file.root + + self.assertRaises(TypeError, h5file.create_array, + root.group, 'arr', [1], createparents=True) + self.assertRaises(TypeError, h5file.copy_node, + '/array', root.group, createparents=True) + self.assertRaises(TypeError, h5file.move_node, + '/array', root.group, createparents=True) + self.assertRaises(TypeError, h5file.copy_children, + '/group', root, createparents=True) + + def test01_inside(self): + """Placing a node inside a nonexistent child of itself.""" + self.assertRaises(tb.NodeError, self.h5file.move_node, + '/group', '/group/foo/bar', + createparents=True) + self.assertNotIn('/group/foo', self.h5file) + self.assertRaises(tb.NodeError, self.h5file.copy_node, + '/group', '/group/foo/bar', + recursive=True, createparents=True) + self.assertNotIn('/group/foo', self.h5fname) + + def test02_filters(self): + """Propagating the filters of created parent groups.""" + + self.h5file.create_group('/group/foo/bar', 'baz', createparents=True) + self.assertIn('/group/foo/bar/baz', self.h5file) + for group in self.h5file.walk_groups('/group'): + self.assertEqual(self.filters, group._v_filters) + + +def suite(): + theSuite = common.unittest.TestSuite() + # This counter is useful when detecting memory leaks + niter = 1 + + for i in range(niter): + theSuite.addTest(common.unittest.makeSuite(TreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(DeepTreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(WideTreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(HiddenTreeTestCase)) + theSuite.addTest(common.unittest.makeSuite(CreateParentsTestCase)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_types.py b/tables/tests/test_types.py new file mode 100644 index 0000000..b9d7d09 --- /dev/null +++ b/tables/tests/test_types.py @@ -0,0 +1,325 @@ +import sys + +import numpy as np + +import tables as tb +from tables.tests import common + + +# Test Record class +class 
Record(tb.IsDescription):
+    var1 = tb.StringCol(itemsize=4)                   # 4-character String
+    var2 = tb.Col.from_kind('int')                    # integer
+    var3 = tb.Col.from_kind('int', itemsize=2)        # short integer
+    var4 = tb.Col.from_kind('float')                  # double (double-precision)
+    var5 = tb.Col.from_kind('float', itemsize=4)      # float (single-precision)
+    var6 = tb.Col.from_kind('complex')                # complex (double-precision parts)
+    var7 = tb.Col.from_kind('complex', itemsize=8)    # complex (single-precision parts)
+    if hasattr(tb, "Float16Atom"):
+        var8 = tb.Col.from_kind('float', itemsize=2)      # half-precision
+    if hasattr(tb, "Float96Atom"):
+        var9 = tb.Col.from_kind('float', itemsize=12)     # extended-precision
+    if hasattr(tb, "Float128Atom"):
+        var10 = tb.Col.from_kind('float', itemsize=16)    # extended-precision
+    if hasattr(tb, "Complex192Atom"):
+        var11 = tb.Col.from_kind('complex', itemsize=24)  # extended-precision
+    if hasattr(tb, "Complex256Atom"):
+        var12 = tb.Col.from_kind('complex', itemsize=32)  # extended-precision
+
+
+class RangeTestCase(common.TempFileMixin, common.PyTablesTestCase):
+    title = "This is the table title"
+    expectedrows = 100
+    maxshort = 2 ** 15
+    maxint = 2_147_483_648   # 2 ** 31, one past the maximum signed 32-bit int
+    compress = 0
+
+    def setUp(self):
+        super().setUp()
+        self.rootgroup = self.h5file.root
+
+        # Create a table
+        self.table = self.h5file.create_table(self.rootgroup, 'table',
+                                              Record, self.title)
+
+    def test00_range(self):
+        """Testing the range check."""
+
+        rec = self.table.row
+
+        # Save a record
+        i = self.maxshort
+        rec['var1'] = '%04d' % (i)
+        rec['var2'] = i
+        rec['var3'] = i
+        rec['var4'] = float(i)
+        rec['var5'] = float(i)
+        rec['var6'] = float(i)
+        rec['var7'] = complex(i, i)
+        if hasattr(tb, "Float16Atom"):
+            rec['var8'] = float(i)
+        if hasattr(tb, "Float96Atom"):
+            rec['var9'] = float(i)
+        if hasattr(tb, "Float128Atom"):
+            rec['var10'] = float(i)
+        try:
+            rec.append()
+        except ValueError:
+            if common.verbose:
+                (exc_type, exc_value, exc_tb) = sys.exc_info()
+                print("\nGreat! The expected ValueError was caught!")
+                print(exc_value)
+            pass
+        else:
+            if common.verbose:
+                print(
+                    "\nThe range overflow no longer raises a ValueError")
+
+    def test01_type(self):
+        """Testing the type check."""
+
+        rec = self.table.row
+        # Save a record
+        i = self.maxshort
+        rec['var1'] = '%04d' % (i)
+        rec['var2'] = i
+        rec['var3'] = i % self.maxshort
+        rec['var5'] = float(i)
+
+        with self.assertRaises(TypeError):
+            rec['var4'] = "124c"
+
+        rec['var6'] = float(i)
+        rec['var7'] = complex(i, i)
+        if hasattr(tb, "Float16Atom"):
+            rec['var8'] = float(i)
+        if hasattr(tb, "Float96Atom"):
+            rec['var9'] = float(i)
+        if hasattr(tb, "Float128Atom"):
+            rec['var10'] = float(i)
+
+
+# Check the dtype read-only attribute
+class DtypeTestCase(common.TempFileMixin, common.PyTablesTestCase):
+
+    def test00a_table(self):
+        """Check dtype accessor for Table objects."""
+
+        a = self.h5file.create_table('/', 'table', Record)
+        self.assertEqual(a.dtype, a.description._v_dtype)
+
+    def test00b_column(self):
+        """Check dtype accessor for Column objects."""
+
+        a = self.h5file.create_table('/', 'table', Record)
+        c = a.cols.var3
+        self.assertEqual(c.dtype, a.description._v_dtype['var3'])
+
+    def test01_array(self):
+        """Check dtype accessor for Array objects."""
+
+        a = self.h5file.create_array('/', 'array', [1, 2])
+        self.assertEqual(a.dtype, a.atom.dtype)
+
+    def test02_carray(self):
+        """Check dtype accessor for CArray objects."""
+
+        a = self.h5file.create_carray('/', 'array', atom=tb.FloatAtom(),
+                                      shape=[1, 2])
+        self.assertEqual(a.dtype, a.atom.dtype)
+
+    def test03_earray(self):
+        """Check
dtype accessor for EArray objects.""" + + a = self.h5file.create_earray('/', 'array', atom=tb.FloatAtom(), + shape=[0, 2]) + self.assertEqual(a.dtype, a.atom.dtype) + + def test04_vlarray(self): + """Check dtype accessor for VLArray objects.""" + + a = self.h5file.create_vlarray('/', 'array', tb.FloatAtom()) + self.assertEqual(a.dtype, a.atom.dtype) + + +class ReadFloatTestCase(common.TestFileMixin, common.PyTablesTestCase): + h5fname = common.test_filename("float.h5") + nrows = 5 + ncols = 6 + + def setUp(self): + super().setUp() + x = np.arange(self.ncols) + y = np.arange(self.nrows) + y.shape = (self.nrows, 1) + self.values = x + y + + def test01_read_float16(self): + dtype = "float16" + if hasattr(np, dtype): + ds = getattr(self.h5file.root, dtype) + self.assertNotIsInstance(ds, tb.UnImplemented) + self.assertEqual(ds.shape, (self.nrows, self.ncols)) + self.assertEqual(ds.dtype, dtype) + self.assertTrue(common.allequal( + ds.read(), self.values.astype(dtype))) + else: + with self.assertWarns(UserWarning): + ds = getattr(self.h5file.root, dtype) + self.assertIsInstance(ds, tb.UnImplemented) + + def test02_read_float32(self): + dtype = "float32" + ds = getattr(self.h5file.root, dtype) + self.assertNotIsInstance(ds, tb.UnImplemented) + self.assertEqual(ds.shape, (self.nrows, self.ncols)) + self.assertEqual(ds.dtype, dtype) + self.assertTrue(common.allequal( + ds.read(), self.values.astype(dtype))) + + def test03_read_float64(self): + dtype = "float64" + ds = getattr(self.h5file.root, dtype) + self.assertNotIsInstance(ds, tb.UnImplemented) + self.assertEqual(ds.shape, (self.nrows, self.ncols)) + self.assertEqual(ds.dtype, dtype) + self.assertTrue(common.allequal( + ds.read(), self.values.astype(dtype))) + + def test04_read_longdouble(self): + dtype = "longdouble" + if hasattr(tb, "Float96Atom") or hasattr(tb, "Float128Atom"): + ds = getattr(self.h5file.root, dtype) + self.assertNotIsInstance(ds, tb.UnImplemented) + self.assertEqual(ds.shape, (self.nrows, self.ncols)) + self.assertEqual(ds.dtype, dtype) + self.assertTrue(common.allequal( + ds.read(), self.values.astype(dtype))) + + if hasattr(tb, "Float96Atom"): + self.assertEqual(ds.dtype, "float96") + elif hasattr(tb, "Float128Atom"): + self.assertEqual(ds.dtype, "float128") + else: + # XXX: check + # the behavior depends on the HDF5 lib configuration + try: + with self.assertWarns(UserWarning): + ds = getattr(self.h5file.root, dtype) + self.assertIsInstance(ds, tb.UnImplemented) + except AssertionError: + if not tb.utilsextension._broken_hdf5_long_double(): + ds = getattr(self.h5file.root, dtype) + self.assertEqual(ds.dtype, "float64") + + def test05_read_quadprecision_float(self): + # XXX: check + try: + with self.assertWarns(UserWarning): + ds = self.h5file.root.quadprecision + self.assertIsInstance(ds, tb.UnImplemented) + except AssertionError: + # NOTE: it would be nice to have some sort of message that warns + # against the potential precision loss: the quad-precision + # dataset actually uses 128 bits for each element, not just + # 80 bits (longdouble) + ds = self.h5file.root.quadprecision + self.assertEqual(ds.dtype, "longdouble") + + +class AtomTestCase(common.PyTablesTestCase): + def test_init_parameters_01(self): + atom1 = tb.StringAtom(itemsize=12) + atom2 = atom1.copy() + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + self.assertIsNot(atom1, atom2) + + def test_init_parameters_02(self): + atom1 = tb.StringAtom(itemsize=12) + atom2 = atom1.copy(itemsize=100, shape=(2, 2)) + self.assertEqual(atom2, 
+ tb.StringAtom(itemsize=100, shape=(2, 2), dflt=b'')) + + def test_init_parameters_03(self): + atom1 = tb.StringAtom(itemsize=12) + self.assertRaises(TypeError, atom1.copy, foobar=42) + + def test_from_dtype_01(self): + atom1 = tb.Atom.from_dtype(np.dtype((np.int16, (2, 2)))) + atom2 = tb.Int16Atom(shape=(2, 2), dflt=0) + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_dtype_02(self): + atom1 = tb.Atom.from_dtype(np.dtype('S5'), dflt=b'hello') + atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello') + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_dtype_03(self): + with self.assertWarns(Warning): + atom1 = tb.Atom.from_dtype(np.dtype('U5'), dflt=b'hello') + atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello') + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_dtype_04(self): + atom1 = tb.Atom.from_dtype(np.dtype('float64')) + atom2 = tb.Float64Atom(shape=(), dflt=0.0) + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_kind_01(self): + atom1 = tb.Atom.from_kind('int', itemsize=2, shape=(2, 2)) + atom2 = tb.Int16Atom(shape=(2, 2), dflt=0) + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_kind_02(self): + atom1 = tb.Atom.from_kind('int', shape=(2, 2)) + atom2 = tb.Int32Atom(shape=(2, 2), dflt=0) + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_kind_03(self): + atom1 = tb.Atom.from_kind('int', shape=1) + atom2 = tb.Int32Atom(shape=(1,), dflt=0) + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_kind_04(self): + atom1 = tb.Atom.from_kind('string', itemsize=5, dflt=b'hello') + atom2 = tb.StringAtom(itemsize=5, shape=(), dflt=b'hello') + self.assertEqual(atom1, atom2) + self.assertEqual(str(atom1), str(atom2)) + + def test_from_kind_05(self): + # ValueError: no default item size for kind ``string`` + self.assertRaises(ValueError, tb.Atom.from_kind, 'string', + dflt=b'hello') + + def test_from_kind_06(self): + # ValueError: unknown kind: 'Float' + self.assertRaises(ValueError, tb.Atom.from_kind, 'Float') + + +def suite(): + import doctest + + theSuite = common.unittest.TestSuite() + + for i in range(1): + theSuite.addTest(doctest.DocTestSuite(tb.atom)) + theSuite.addTest(common.unittest.makeSuite(AtomTestCase)) + theSuite.addTest(common.unittest.makeSuite(RangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(DtypeTestCase)) + theSuite.addTest(common.unittest.makeSuite(ReadFloatTestCase)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/test_utils.py b/tables/tests/test_utils.py new file mode 100644 index 0000000..3c541f6 --- /dev/null +++ b/tables/tests/test_utils.py @@ -0,0 +1,94 @@ +import sys +from io import StringIO + +from unittest.mock import patch + +import tables.scripts.ptrepack as ptrepack +import tables.scripts.ptdump as ptdump +import tables.scripts.pttree as pttree +from tables.tests import common + + +class ptrepackTestCase(common.PyTablesTestCase): + """Test ptrepack""" + + @patch.object(ptrepack, 'copy_leaf') + @patch.object(ptrepack.tb, 'open_file') + def test_paths_windows(self, mock_open_file, mock_copy_leaf): + """Checking handling of windows filenames: test gh-616""" + + # this filename has a semi-colon to check for + # regression 
of gh-616
+        src_fn = 'D:\\window~1\\path\\000\\infile'
+        src_path = '/'
+        dst_fn = 'another\\path\\'
+        dst_path = '/path/in/outfile'
+
+        argv = ['ptrepack', src_fn + ':' + src_path, dst_fn + ':' + dst_path]
+        with patch.object(sys, 'argv', argv):
+            ptrepack.main()
+
+        args, kwargs = mock_open_file.call_args_list[0]
+        self.assertEqual(args, (src_fn, 'r'))
+
+        args, kwargs = mock_copy_leaf.call_args_list[0]
+        self.assertEqual(args, (src_fn, dst_fn, src_path, dst_path))
+
+
+class ptdumpTestCase(common.PyTablesTestCase):
+    """Test ptdump"""
+
+    @patch.object(ptdump.tb, 'open_file')
+    @patch('sys.stdout', new_callable=StringIO)
+    def test_paths_windows(self, _, mock_open_file):
+        """Checking handling of windows filenames: test gh-616"""
+
+        # this filename has a drive-letter colon to check for
+        # regression of gh-616 (in ptdump)
+        src_fn = 'D:\\window~1\\path\\000\\ptdump'
+        src_path = '/'
+
+        argv = ['ptdump', src_fn + ':' + src_path]
+        with patch.object(sys, 'argv', argv):
+            ptdump.main()
+
+        args, kwargs = mock_open_file.call_args_list[0]
+        self.assertEqual(args, (src_fn, 'r'))
+
+
+class pttreeTestCase(common.PyTablesTestCase):
+    """Test pttree"""
+
+    @patch.object(pttree.tb, 'open_file')
+    @patch.object(pttree, 'get_tree_str')
+    @patch('sys.stdout', new_callable=StringIO)
+    def test_paths_windows(self, _, mock_get_tree_str, mock_open_file):
+        """Checking handling of windows filenames: test gh-616"""
+
+        # this filename has a drive-letter colon to check for
+        # regression of gh-616 (in pttree)
+        src_fn = 'D:\\window~1\\path\\000\\pttree'
+        src_path = '/'
+
+        argv = ['pttree', src_fn + ':' + src_path]
+        with patch.object(sys, 'argv', argv):
+            pttree.main()
+
+        args, kwargs = mock_open_file.call_args_list[0]
+        self.assertEqual(args, (src_fn, 'r'))
+
+
+def suite():
+    theSuite = common.unittest.TestSuite()
+
+    theSuite.addTest(common.unittest.makeSuite(ptrepackTestCase))
+    theSuite.addTest(common.unittest.makeSuite(ptdumpTestCase))
+    theSuite.addTest(common.unittest.makeSuite(pttreeTestCase))
+
+    return theSuite
+
+
+if __name__ == '__main__':
+    common.parse_argv(sys.argv)
+    common.print_versions()
+    common.unittest.main(defaultTest='suite')
diff --git a/tables/tests/test_vlarray.py b/tables/tests/test_vlarray.py
new file mode 100644
index 0000000..9210deb
--- /dev/null
+++ b/tables/tests/test_vlarray.py
@@ -0,0 +1,4415 @@
+import sys
+
+import numpy as np
+
+import tables as tb
+from tables.tests import common
+
+
+class C:
+    c = (3, 4.5)
+
+
+class BasicTestCase(common.TempFileMixin, common.PyTablesTestCase):
+    compress = 0
+    complib = "zlib"
+    shuffle = 0
+    bitshuffle = 0
+    fletcher32 = 0
+    flavor = "numpy"
+
+    def setUp(self):
+        super().setUp()
+
+        # Create an instance of an HDF5 Table
+        self.rootgroup = self.h5file.root
+        self.populateFile()
+        self.h5file.close()
+
+    def populateFile(self):
+        group = self.rootgroup
+        filters = tb.Filters(complevel=self.compress,
+                             complib=self.complib,
+                             shuffle=self.shuffle,
+                             bitshuffle=self.bitshuffle,
+                             fletcher32=self.fletcher32)
+        vlarray = self.h5file.create_vlarray(group, 'vlarray1',
+                                             atom=tb.Int32Atom(),
+                                             title="ragged array of ints",
+                                             filters=filters,
+                                             expectedrows=1000)
+        vlarray.flavor = self.flavor
+
+        # Fill it with 5 rows
+        vlarray.append([1, 2])
+        if self.flavor == "numpy":
+            vlarray.append(np.array([3, 4, 5], dtype='int32'))
+            vlarray.append(np.array([], dtype='int32'))    # Empty entry
+        elif self.flavor == "python":
+            vlarray.append((3, 4, 5))
+            vlarray.append(())    # Empty entry
+        vlarray.append([6, 7, 8, 9])
+        vlarray.append([10, 11, 12, 13, 14])
+
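+    # A minimal usage sketch (illustration only, not exercised by the tests
+    # below): with the default "numpy" flavor, the five ragged rows written
+    # by populateFile() read back as one 1-D int32 array per row, while the
+    # "python" flavor returns plain lists instead:
+    #
+    #     with tb.open_file(self.h5fname) as h5f:
+    #         rows = h5f.root.vlarray1.read()
+    #     # rows -> [array([1, 2]), array([3, 4, 5]), array([], dtype=int32),
+    #     #          array([6, 7, 8, 9]), array([10, 11, 12, 13, 14])]
+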
+ def test00_attributes(self): + self.h5file = tb.open_file(self.h5fname, "r") + obj = self.h5file.get_node("/vlarray1") + + self.assertEqual(obj.flavor, self.flavor) + self.assertEqual(obj.shape, (5,)) + self.assertEqual(obj.ndim, 1) + self.assertEqual(obj.nrows, 5) + self.assertEqual(obj.atom.type, 'int32') + + def test01_read(self): + """Checking vlarray read.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_read..." % self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + vlarray = self.h5file.get_node("/vlarray1") + + # Choose a small value for buffer size + vlarray.nrowsinbuf = 3 + # Read some rows + row = vlarray.read(0)[0] + row2 = vlarray.read(2)[0] + if common.verbose: + print("Flavor:", vlarray.flavor) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row) + + nrows = 5 + self.assertEqual(nrows, vlarray.nrows) + if self.flavor == "numpy": + self.assertEqual(type(row), np.ndarray) + self.assertTrue(common.allequal( + row, np.array([1, 2], dtype='int32'), self.flavor)) + self.assertTrue(common.allequal( + row2, np.array([], dtype='int32'), self.flavor)) + elif self.flavor == "python": + self.assertEqual(row, [1, 2]) + self.assertEqual(row2, []) + self.assertEqual(len(row), 2) + + # Check filters: + if self.compress != vlarray.filters.complevel and common.verbose: + print("Error in compress. Class:", self.__class__.__name__) + print("self, vlarray:", self.compress, vlarray.filters.complevel) + self.assertEqual(vlarray.filters.complevel, self.compress) + if self.compress > 0 and tb.which_lib_version(self.complib): + self.assertEqual(vlarray.filters.complib, self.complib) + if self.shuffle != vlarray.filters.shuffle and common.verbose: + print("Error in shuffle. Class:", self.__class__.__name__) + print("self, vlarray:", self.shuffle, vlarray.filters.shuffle) + self.assertEqual(self.shuffle, vlarray.filters.shuffle) + if self.bitshuffle != vlarray.filters.bitshuffle and common.verbose: + print("Error in shuffle. Class:", self.__class__.__name__) + print("self, vlarray:", self.bitshuffle, + vlarray.filters.bitshuffle) + self.assertEqual(self.shuffle, vlarray.filters.shuffle) + if self.fletcher32 != vlarray.filters.fletcher32 and common.verbose: + print("Error in fletcher32. Class:", self.__class__.__name__) + print("self, vlarray:", self.fletcher32, + vlarray.filters.fletcher32) + self.assertEqual(self.fletcher32, vlarray.filters.fletcher32) + + def test02a_getitem(self): + """Checking vlarray __getitem__ (slices)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02a_getitem..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + vlarray = self.h5file.get_node("/vlarray1") + + rows = [[1, 2], [3, 4, 5], [], [6, 7, 8, 9], [10, 11, 12, 13, 14]] + + slices = [ + slice(None, None, None), slice(1, 1, 1), slice(30, None, None), + slice(0, None, None), slice(3, None, 1), slice(3, None, 2), + slice(None, 1, None), slice(None, 2, 1), slice(None, 30, 2), + slice(None, None, 1), slice(None, None, 2), slice(None, None, 3), + ] + for slc in slices: + # Read the rows in slc + rows2 = vlarray[slc] + rows1 = rows[slc] + rows1f = [] + if common.verbose: + print("Flavor:", vlarray.flavor) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Original rows ==>", rows1) + print("Rows read in vlarray ==>", rows2) + + if self.flavor == "numpy": + for val in rows1: + rows1f.append(np.array(val, dtype='int32')) + for i in range(len(rows1f)): + self.assertTrue(common.allequal( + rows2[i], rows1f[i], self.flavor)) + elif self.flavor == "python": + self.assertEqual(rows2, rows1) + + def test02b_getitem(self): + """Checking vlarray __getitem__ (scalars)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_getitem..." % self.__class__.__name__) + + if self.flavor != "numpy": + # This test is only valid for NumPy + return + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "r") + vlarray = self.h5file.get_node("/vlarray1") + + # Get a numpy array of objects + rows = np.array(vlarray[:], dtype=object) + + for slc in [0, np.array(1), 2, np.array([3]), [4]]: + # Read the rows in slc + rows2 = vlarray[slc] + rows1 = rows[slc] + if common.verbose: + print("Flavor:", vlarray.flavor) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Original rows ==>", rows1) + print("Rows read in vlarray ==>", rows2) + + for i in range(len(rows1)): + self.assertTrue(common.allequal( + rows2[i], rows1[i], self.flavor)) + + def test03_append(self): + """Checking vlarray append.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_append..." 
% self.__class__.__name__) + + # Create an instance of an HDF5 Table + self.h5file = tb.open_file(self.h5fname, "a") + vlarray = self.h5file.get_node("/vlarray1") + + # Append a new row + vlarray.append([7, 8, 9, 10]) + + # Choose a small value for buffer size + vlarray.nrowsinbuf = 3 + + # Read some rows: + row1 = vlarray[0] + row2 = vlarray[2] + row3 = vlarray[-1] + if common.verbose: + print("Flavor:", vlarray.flavor) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row1) + + nrows = 6 + self.assertEqual(nrows, vlarray.nrows) + if self.flavor == "numpy": + self.assertEqual(type(row1), type(np.array([1, 2]))) + self.assertTrue(common.allequal( + row1, np.array([1, 2], dtype='int32'), self.flavor)) + self.assertTrue(common.allequal( + row2, np.array([], dtype='int32'), self.flavor)) + self.assertTrue(common.allequal( + row3, np.array([7, 8, 9, 10], dtype='int32'), self.flavor)) + elif self.flavor == "python": + self.assertEqual(row1, [1, 2]) + self.assertEqual(row2, []) + self.assertEqual(row3, [7, 8, 9, 10]) + self.assertEqual(len(row3), 4) + + def test04_get_row_size(self): + """Checking get_row_size method.""" + + self.h5file = tb.open_file(self.h5fname, "a") + vlarray = self.h5file.get_node("/vlarray1") + + self.assertEqual(vlarray.get_row_size(0), 2 * vlarray.atom.size) + self.assertEqual(vlarray.get_row_size(1), 3 * vlarray.atom.size) + self.assertEqual(vlarray.get_row_size(2), 0 * vlarray.atom.size) + self.assertEqual(vlarray.get_row_size(3), 4 * vlarray.atom.size) + self.assertEqual(vlarray.get_row_size(4), 5 * vlarray.atom.size) + + +class BasicNumPyTestCase(BasicTestCase): + flavor = "numpy" + + +class BasicPythonTestCase(BasicTestCase): + flavor = "python" + + +class ZlibComprTestCase(BasicTestCase): + compress = 1 + complib = "zlib" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscComprTestCase(BasicTestCase): + compress = 9 + shuffle = 0 + complib = "blosc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscShuffleComprTestCase(BasicTestCase): + compress = 6 + shuffle = 1 + complib = "blosc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + common.blosc_version < common.min_blosc_bitshuffle_version, + f'BLOSC >= {common.min_blosc_bitshuffle_version} required') +class BloscBitShuffleComprTestCase(BasicTestCase): + compress = 9 + bitshuffle = 1 + complib = "blosc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +class BloscBloscLZComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:blosclz" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class BloscLZ4ComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:lz4" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'lz4' not in tb.blosc_compressor_list(), 'lz4 required') +class BloscLZ4HCComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:lz4hc" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf('snappy' not in tb.blosc_compressor_list(), + 'snappy required') +class 
BloscSnappyComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:snappy" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zlib' not in tb.blosc_compressor_list(), 'zlib required') +class BloscZlibComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:zlib" + + +@common.unittest.skipIf(not common.blosc_avail, + 'BLOSC compression library not available') +@common.unittest.skipIf( + 'zstd' not in tb.blosc_compressor_list(), 'zstd required') +class BloscZstdComprTestCase(BasicTestCase): + compress = 9 + shuffle = 1 + complib = "blosc:zstd" + + +@common.unittest.skipIf( + not common.lzo_avail, 'LZO compression library not available') +class LZOComprTestCase(BasicTestCase): + compress = 1 + complib = "lzo" + + +@common.unittest.skipIf(not common.bzip2_avail, + 'BZIP2 compression library not available') +class Bzip2ComprTestCase(BasicTestCase): + compress = 1 + complib = "bzip2" + + +class ShuffleComprTestCase(BasicTestCase): + compress = 1 + shuffle = 1 + + +class TypesTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def test01_StringAtom(self): + """Checking vlarray with NumPy string atoms ('numpy' flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_StringAtom..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "numpy" + vlarray.append(np.array(["1", "12", "123", "1234", "12345"])) + vlarray.append(np.array(["1", "12345"])) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + np.testing.assert_array_equal( + row[0], np.array(["1", "12", "123", "123", "123"], 'S')) + np.testing.assert_array_equal(row[1], np.array(["1", "123"], 'S')) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 2) + + def test01a_StringAtom(self): + """Checking vlarray with NumPy string atoms ('numpy' flavor, + strided)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_StringAtom..." 
% self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "numpy" + vlarray.append(np.array(["1", "12", "123", "1234", "12345"][::2])) + vlarray.append(np.array(["1", "12345", "2", "321"])[::3]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + np.testing.assert_array_equal(row[0], + np.array(["1", "123", "123"], 'S')) + np.testing.assert_array_equal(row[1], np.array(["1", "321"], 'S')) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test01a_2_StringAtom(self): + """Checking vlarray with NumPy string atoms (NumPy flavor, no conv)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_2_StringAtom..." % + self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "numpy" + vlarray.append(np.array(["1", "12", "123", "123"])) + vlarray.append(np.array(["1", "2", "321"])) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + np.testing.assert_array_equal( + row[0], np.array(["1", "12", "123", "123"], 'S')) + np.testing.assert_array_equal(row[1], np.array(["1", "2", "321"], 'S')) + self.assertEqual(len(row[0]), 4) + self.assertEqual(len(row[1]), 3) + + def test01b_StringAtom(self): + """Checking vlarray with NumPy string atoms (python flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_StringAtom..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom2', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "python" + vlarray.append(["1", "12", "123", "1234", "12345"]) + vlarray.append(["1", "12345"]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing String flavor") + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], [b"1", b"12", b"123", b"123", b"123"]) + self.assertEqual(row[1], [b"1", b"123"]) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 2) + + def test01c_StringAtom(self): + """Checking updating vlarray with NumPy string atoms + ('numpy' flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_StringAtom..." 
% self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "numpy" + vlarray.append(np.array(["1", "12", "123", "1234", "12345"])) + vlarray.append(np.array(["1", "12345"])) + + # Modify the rows + vlarray[0] = np.array(["1", "123", "12", "", "12345"]) + vlarray[1] = np.array(["44", "4"]) # This should work as well + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([b"1", b"123", b"12", b"", b"123"]))) + self.assertTrue(common.allequal( + row[1], np.array(["44", "4"], dtype="S3"))) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 2) + + def test01d_StringAtom(self): + """Checking updating vlarray with string atoms (String flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01d_StringAtom..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'stringAtom2', + atom=tb.StringAtom(itemsize=3), + title="Ragged array of strings") + vlarray.flavor = "python" + vlarray.append(["1", "12", "123", "1234", "12345"]) + vlarray.append(["1", "12345"]) + + # Modify the rows + vlarray[0] = ["1", "123", "12", "", "12345"] + vlarray[1] = ["44", "4"] + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing String flavor") + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], [b"1", b"123", b"12", b"", b"123"]) + self.assertEqual(row[1], [b"44", b"4"]) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 2) + + def test02_BoolAtom(self): + """Checking vlarray with boolean atoms.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_BoolAtom..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'BoolAtom', + atom=tb.BoolAtom(), + title="Ragged array of Booleans") + vlarray.append([1, 0, 3]) + vlarray.append([-1, 0]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([1, 0, 1], dtype='bool'))) + self.assertTrue(common.allequal( + row[1], np.array([1, 0], dtype='bool'))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test02b_BoolAtom(self): + """Checking setting vlarray with boolean atoms.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_BoolAtom..." 
% self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', 'BoolAtom', + atom=tb.BoolAtom(), + title="Ragged array of Booleans") + vlarray.append([1, 0, 3]) + vlarray.append([-1, 0]) + + # Modify the rows + vlarray[0] = (0, 1, 3) + vlarray[1] = (0, -1) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([0, 1, 1], dtype='bool'))) + self.assertTrue(common.allequal( + row[1], np.array([0, 1], dtype='bool'))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test03_IntAtom(self): + """Checking vlarray with integer atoms.""" + + ttypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + # "UInt64", # Unavailable in some platforms + ] + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_IntAtom..." % self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([1, 2, 3]) + vlarray.append([-1, 0]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue( + common.allequal(row[0], np.array([1, 2, 3], dtype=atype))) + self.assertTrue( + common.allequal(row[1], np.array([-1, 0], dtype=atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test03a_IntAtom(self): + """Checking vlarray with integer atoms (byteorder swapped)""" + + ttypes = { + "int8": np.int8, + "uint8": np.uint8, + "int16": np.int16, + "uint16": np.uint16, + "int32": np.int32, + "uint32": np.uint32, + "int64": np.int64, + # "UInt64": numpy.int64, # Unavailable in some platforms + } + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03a_IntAtom..." 
% self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype])) + a0 = np.array([1, 2, 3], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray.append(a0) + a1 = np.array([-1, 0], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray.append(a1) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([1, 2, 3], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([-1, 0], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test03b_IntAtom(self): + """Checking updating vlarray with integer atoms.""" + + ttypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + # "UInt64", # Unavailable in some platforms + ] + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_IntAtom..." % self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([1, 2, 3]) + vlarray.append([-1, 0]) + + # Modify rows + vlarray[0] = (3, 2, 1) + vlarray[1] = (0, -1) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue( + common.allequal(row[0], np.array([3, 2, 1], dtype=atype))) + self.assertTrue( + common.allequal(row[1], np.array([0, -1], dtype=atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test03c_IntAtom(self): + """Checking updating vlarray with integer atoms (byteorder swapped)""" + + ttypes = { + "int8": np.int8, + "uint8": np.uint8, + "int16": np.int16, + "uint16": np.uint16, + "int32": np.int32, + "uint32": np.uint32, + "int64": np.int64, + # "UInt64": numpy.int64, # Unavailable in some platforms + } + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03c_IntAtom..." 
% self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype])) + a0 = np.array([1, 2, 3], dtype=atype) + vlarray.append(a0) + a1 = np.array([-1, 0], dtype=atype) + vlarray.append(a1) + + # Modify rows + a0 = np.array([3, 2, 1], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray[0] = a0 + a1 = np.array([0, -1], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray[1] = a1 + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([3, 2, 1], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([0, -1], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test03d_IntAtom(self): + """Checking updating vlarray with integer atoms (another byteorder)""" + + ttypes = { + "int8": np.int8, + "uint8": np.uint8, + "int16": np.int16, + "uint16": np.uint16, + "int32": np.int32, + "uint32": np.uint32, + "int64": np.int64, + # "UInt64": numpy.int64, # Unavailable in some platforms + } + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03d_IntAtom..." % self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype]), + byteorder=byteorder) + a0 = np.array([1, 2, 3], dtype=atype) + vlarray.append(a0) + a1 = np.array([-1, 0], dtype=atype) + vlarray.append(a1) + + # Modify rows + a0 = np.array([3, 2, 1], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray[0] = a0 + a1 = np.array([0, -1], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray[1] = a1 + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + byteorder2 = tb.utils.byteorders[row[0].dtype.byteorder] + if byteorder2 != "irrelevant": + self.assertEqual(tb.utils.byteorders[row[0].dtype.byteorder], + sys.byteorder) + self.assertEqual(vlarray.byteorder, byteorder) + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([3, 2, 1], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([0, -1], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04_FloatAtom(self): + """Checking vlarray with floating point atoms.""" + + ttypes = [ + "float32", + "float64", + ] + for name in ("float16", "float96", "float128"): + atomname = name.capitalize() + 'Atom' + if hasattr(tb, atomname): + ttypes.append(name) + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_FloatAtom..." 
% self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([1.3, 2.2, 3.3]) + vlarray.append([-1.3e34, 1.e-32]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([1.3, 2.2, 3.3], atype))) + self.assertTrue(common.allequal( + row[1], np.array([-1.3e34, 1.e-32], atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04a_FloatAtom(self): + """Checking vlarray with float atoms (byteorder swapped)""" + + ttypes = { + "float32": np.float32, + "float64": np.float64, + } + if hasattr(tb, "Float16Atom"): + ttypes["float16"] = np.float16 + if hasattr(tb, "Float96Atom"): + ttypes["float96"] = np.float96 + if hasattr(tb, "Float128Atom"): + ttypes["float128"] = np.float128 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04a_FloatAtom..." % self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype])) + a0 = np.array([1.3, 2.2, 3.3], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray.append(a0) + a1 = np.array([-1.3e34, 1.e-32], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray.append(a1) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([1.3, 2.2, 3.3], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([-1.3e34, 1.e-32], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04b_FloatAtom(self): + """Checking updating vlarray with floating point atoms.""" + + ttypes = [ + "float32", + "float64", + ] + for name in ("float16", "float96", "float128"): + atomname = name.capitalize() + 'Atom' + if hasattr(tb, atomname): + ttypes.append(name) + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04b_FloatAtom..." 
% self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([1.3, 2.2, 3.3]) + vlarray.append([-1.3e34, 1.e-32]) + + # Modifiy some rows + vlarray[0] = (4.3, 2.2, 4.3) + vlarray[1] = (-1.1e34, 1.3e-32) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([4.3, 2.2, 4.3], atype))) + self.assertTrue( + common.allequal(row[1], np.array([-1.1e34, 1.3e-32], atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04c_FloatAtom(self): + """Checking updating vlarray with float atoms (byteorder swapped)""" + + ttypes = { + "float32": np.float32, + "float64": np.float64, + } + if hasattr(tb, "Float16Atom"): + ttypes["float16"] = np.float16 + if hasattr(tb, "Float96Atom"): + ttypes["float96"] = np.float96 + if hasattr(tb, "Float128Atom"): + ttypes["float128"] = np.float128 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04c_FloatAtom..." % self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype])) + a0 = np.array([1.3, 2.2, 3.3], dtype=atype) + vlarray.append(a0) + a1 = np.array([-1, 0], dtype=atype) + vlarray.append(a1) + + # Modify rows + a0 = np.array([4.3, 2.2, 4.3], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray[0] = a0 + a1 = np.array([-1.1e34, 1.3e-32], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray[1] = a1 + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([4.3, 2.2, 4.3], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([-1.1e34, 1.3e-32], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04d_FloatAtom(self): + """Checking updating vlarray with float atoms (another byteorder)""" + + ttypes = { + "float32": np.float32, + "float64": np.float64, + } + if hasattr(tb, "Float16Atom"): + ttypes["float16"] = np.float16 + if hasattr(tb, "Float96Atom"): + ttypes["float96"] = np.float96 + if hasattr(tb, "Float128Atom"): + ttypes["float128"] = np.float128 + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04d_FloatAtom..." 
% self.__class__.__name__) + + byteorder = {'little': 'big', 'big': 'little'}[sys.byteorder] + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(ttypes[atype]), + byteorder=byteorder) + a0 = np.array([1.3, 2.2, 3.3], dtype=atype) + vlarray.append(a0) + a1 = np.array([-1, 0], dtype=atype) + vlarray.append(a1) + + # Modify rows + a0 = np.array([4.3, 2.2, 4.3], dtype=atype) + a0 = a0.byteswap() + a0 = a0.newbyteorder() + vlarray[0] = a0 + a1 = np.array([-1.1e34, 1.3e-32], dtype=atype) + a1 = a1.byteswap() + a1 = a1.newbyteorder() + vlarray[1] = a1 + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.byteorder, byteorder) + self.assertEqual(tb.utils.byteorders[row[0].dtype.byteorder], + sys.byteorder) + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([4.3, 2.2, 4.3], dtype=ttypes[atype]))) + self.assertTrue(common.allequal( + row[1], np.array([-1.1e34, 1.3e-32], dtype=ttypes[atype]))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04_ComplexAtom(self): + """Checking vlarray with numerical complex atoms.""" + + ttypes = [ + "complex64", + "complex128", + ] + + if hasattr(tb, "Complex192Atom"): + ttypes.append("complex192") + if hasattr(tb, "Complex256Atom"): + ttypes.append("complex256") + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_ComplexAtom..." % self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([(1.3 + 0j), (0+2.2j), (3.3+3.3j)]) + vlarray.append([(0-1.3e34j), (1.e-32 + 0j)]) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([(1.3 + 0j), (0+2.2j), (3.3+3.3j)], atype))) + self.assertTrue(common.allequal( + row[1], np.array([(0-1.3e34j), (1.e-32 + 0j)], atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test04b_ComplexAtom(self): + """Checking modifying vlarray with numerical complex atoms.""" + + ttypes = [ + "complex64", + "complex128", + ] + + if hasattr(tb, "Complex192Atom"): + ttypes.append("complex192") + if hasattr(tb, "Complex256Atom"): + ttypes.append("complex256") + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04b_ComplexAtom..." 
% + self.__class__.__name__) + + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + '/', atype, atom=tb.Atom.from_sctype(atype)) + vlarray.append([(1.3 + 0j), (0+2.2j), (3.3+3.3j)]) + vlarray.append([(0-1.3e34j), (1.e-32 + 0j)]) + + # Modify the rows + vlarray[0] = ((1.4 + 0j), (0+4.2j), (3.3+4.3j)) + vlarray[1] = ((4-1.3e34j), (1.e-32 + 4j)) + + if self.reopen: + name = vlarray._v_pathname + self._reopen(mode='a') + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([(1.4 + 0j), (0+4.2j), (3.3+4.3j)], atype))) + self.assertTrue(common.allequal( + row[1], np.array([(4-1.3e34j), (1.e-32 + 4j)], atype))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 2) + + def test05_VLStringAtom(self): + """Checking vlarray with variable length strings.""" + + # Skip the test if the default encoding has been mangled. + if sys.getdefaultencoding() != 'ascii': + return + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_VLStringAtom..." % + self.__class__.__name__) + + vlarray = self.h5file.create_vlarray( + '/', "VLStringAtom", atom=tb.VLStringAtom()) + vlarray.append(b"asd") + vlarray.append(b"asd\xe4") + vlarray.append(b"aaana") + vlarray.append(b"") + # Check for ticket #62. + self.assertRaises(TypeError, vlarray.append, [b"foo", b"bar"]) + # `VLStringAtom` makes no encoding assumptions. See ticket #51. + self.assertRaises(UnicodeEncodeError, vlarray.append, "asd\xe4") + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 4) + self.assertEqual(row[0], b"asd") + self.assertEqual(row[1], b"asd\xe4") + self.assertEqual(row[2], b"aaana") + self.assertEqual(row[3], b"") + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + self.assertEqual(len(row[3]), 0) + + def test05b_VLStringAtom(self): + """Checking updating vlarray with variable length strings.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_VLStringAtom..." 
% + self.__class__.__name__) + + vlarray = self.h5file.create_vlarray( + '/', "VLStringAtom", atom=tb.VLStringAtom()) + vlarray.append(b"asd") + vlarray.append(b"aaana") + + # Modify values + vlarray[0] = b"as4" + vlarray[1] = b"aaanc" + self.assertRaises(ValueError, vlarray.__setitem__, 1, b"shrt") + self.assertRaises(ValueError, vlarray.__setitem__, 1, b"toolong") + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", repr(row[0])) + print("Second row in vlarray ==>", repr(row[1])) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], b"as4") + self.assertEqual(row[1], b"aaanc") + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 5) + + def test06a_Object(self): + """Checking vlarray with object atoms.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06a_Object..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray( + '/', "Object", atom=tb.ObjectAtom()) + vlarray.append( + [[1, 2, 3], "aaa", "aaa\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"]) + vlarray.append([3, 4, C()]) + vlarray.append(42) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual( + row[0], + [[1, 2, 3], "aaa", "aaa\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd"]) + list1 = list(row[1]) + obj = list1.pop() + self.assertEqual(list1, [3, 4]) + self.assertEqual(obj.c, C().c) + self.assertEqual(row[2], 42) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 3) + self.assertRaises(TypeError, len, row[2]) + + def test06b_Object(self): + """Checking updating vlarray with object atoms.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06b_Object..." 
% self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', "Object", + atom=tb.ObjectAtom()) + # When updating an object, this seems to change the number + # of bytes that pickle.dumps generates + # vlarray.append( + # ([1,2,3], "aaa", "aaa\xef\xbf\xbd\xef\xbf\xbd\xef\xbf\xbd")) + vlarray.append(([1, 2, 3], "aaa", "\xef\xbf\xbd\xef\xbf\xbd4")) + # vlarray.append([3,4, C()]) + vlarray.append([3, 4, [24]]) + + # Modify the rows + # vlarray[0] = ([1,2,4], "aa4", "aaa\xef\xbf\xbd\xef\xbf\xbd4") + vlarray[0] = ([1, 2, 4], "aa4", "\xef\xbf\xbd\xef\xbf\xbd5") + # vlarray[1] = (3,4, C()) + vlarray[1] = [4, 4, [24]] + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], + ([1, 2, 4], "aa4", "\xef\xbf\xbd\xef\xbf\xbd5")) + list1 = list(row[1]) + obj = list1.pop() + self.assertEqual(list1, [4, 4]) + + # self.assertEqual(obj.c, C().c) + self.assertEqual(obj, [24]) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 3) + + def test06c_Object(self): + """Checking vlarray with object atoms (numpy arrays as values)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06c_Object..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', "Object", + atom=tb.ObjectAtom()) + vlarray.append(np.array([[1, 2], [0, 4]], 'i4')) + vlarray.append(np.array([0, 1, 2, 3], 'i8')) + vlarray.append(np.array(42, 'i1')) + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertTrue(common.allequal( + row[0], np.array([[1, 2], [0, 4]], 'i4'))) + self.assertTrue(common.allequal(row[1], np.array([0, 1, 2, 3], 'i8'))) + self.assertTrue(common.allequal(row[2], np.array(42, 'i1'))) + + def test06d_Object(self): + """Checking updating vlarray with object atoms (numpy arrays)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test06d_Object..." % self.__class__.__name__) + + vlarray = self.h5file.create_vlarray('/', "Object", + atom=tb.ObjectAtom()) + vlarray.append(np.array([[1, 2], [0, 4]], 'i4')) + vlarray.append(np.array([0, 1, 2, 3], 'i8')) + vlarray.append(np.array(42, 'i1')) + + # Modify the rows. Since PyTables 2.2.1 we use a binary + # pickle for arrays and ObjectAtoms, so the next should take + # the same space than the above. 
+ vlarray[0] = np.array([[1, 0], [0, 4]], 'i4') + vlarray[1] = np.array([0, 1, 0, 3], 'i8') + vlarray[2] = np.array(22, 'i1') + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertTrue(common.allequal( + row[0], np.array([[1, 0], [0, 4]], 'i4'))) + self.assertTrue(common.allequal( + row[1], np.array([0, 1, 0, 3], 'i8'))) + self.assertTrue(common.allequal(row[2], np.array(22, 'i1'))) + + def test07_VLUnicodeAtom(self): + """Checking vlarray with variable length Unicode strings.""" + + # Skip the test if the default encoding has been mangled. + if sys.getdefaultencoding() != 'ascii': + return + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07_VLUnicodeAtom..." % + self.__class__.__name__) + + vlarray = self.h5file.create_vlarray( + '/', "VLUnicodeAtom", atom=tb.VLUnicodeAtom()) + vlarray.append("asd") + vlarray.append("asd\u0140") + vlarray.append("aaana") + vlarray.append("") + # Check for ticket #62. + self.assertRaises(TypeError, vlarray.append, ["foo", "bar"]) + # `VLUnicodeAtom` makes no encoding assumptions. + self.assertRaises(UnicodeDecodeError, vlarray.append, "asd\xe4") + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 4) + self.assertEqual(row[0], "asd") + self.assertEqual(row[1], "asd\u0140") + self.assertEqual(row[2], "aaana") + self.assertEqual(row[3], "") + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + self.assertEqual(len(row[3]), 0) + + def test07b_VLUnicodeAtom(self): + """Checking updating vlarray with variable length Unicode strings.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test07b_VLUnicodeAtom..." 
% + self.__class__.__name__) + + vlarray = self.h5file.create_vlarray( + '/', "VLUnicodeAtom", atom=tb.VLUnicodeAtom()) + vlarray.append("asd") + vlarray.append("aaan\xe4") + + # Modify values + vlarray[0] = "as\xe4" + vlarray[1] = "aaan\u0140" + self.assertRaises(ValueError, vlarray.__setitem__, 1, "shrt") + self.assertRaises(ValueError, vlarray.__setitem__, 1, "toolong") + + if self.reopen: + name = vlarray._v_pathname + self._reopen() + vlarray = self.h5file.get_node(name) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", repr(row[0])) + print("Second row in vlarray ==>", repr(row[1])) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], "as\xe4") + self.assertEqual(row[1], "aaan\u0140") + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 5) + + +class TypesReopenTestCase(TypesTestCase): + title = "Reopen" + reopen = True + + +class TypesNoReopenTestCase(TypesTestCase): + title = "No reopen" + reopen = False + + +class MDTypesTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + + def test01_StringAtom(self): + """Checking vlarray with MD NumPy string atoms.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_StringAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, 'stringAtom', tb.StringAtom(itemsize=3, shape=(2,)), + "Ragged array of strings") + vlarray.append([["123", "45"], ["45", "123"]]) + vlarray.append([["s", "abc"], ["abc", "f"], + ["s", "ab"], ["ab", "f"]]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + np.testing.assert_array_equal( + row[0], np.array([["123", "45"], ["45", "123"]], 'S')) + np.testing.assert_array_equal( + row[1], np.array([["s", "abc"], ["abc", "f"], + ["s", "ab"], ["ab", "f"]], 'S')) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 4) + + def test01b_StringAtom(self): + """Checking vlarray with MD NumPy string atoms ('python' flavor)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_StringAtom..." 
% self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, 'stringAtom', tb.StringAtom(itemsize=3, shape=(2,)), + "Ragged array of strings") + vlarray.flavor = "python" + vlarray.append([["123", "45"], ["45", "123"]]) + vlarray.append([["s", "abc"], ["abc", "f"], + ["s", "ab"], ["ab", "f"]]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], [[b"123", b"45"], [b"45", b"123"]]) + self.assertEqual(row[1], [[b"s", b"abc"], [b"abc", b"f"], + [b"s", b"ab"], [b"ab", b"f"]]) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 4) + + def test01c_StringAtom(self): + """Checking vlarray with MD NumPy string atoms (with offset)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01c_StringAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, 'stringAtom', tb.StringAtom(itemsize=3, shape=(2,)), + "Ragged array of strings") + vlarray.flavor = "python" + a = np.array([["a", "b"], ["123", "45"], ["45", "123"]], dtype="S3") + vlarray.append(a[1:]) + a = np.array([["s", "a"], ["ab", "f"], + ["s", "abc"], ["abc", "f"], + ["s", "ab"], ["ab", "f"]]) + vlarray.append(a[2:]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], [[b"123", b"45"], [b"45", b"123"]]) + self.assertEqual(row[1], [[b"s", b"abc"], [b"abc", b"f"], + [b"s", b"ab"], [b"ab", b"f"]]) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 4) + + def test01d_StringAtom(self): + """Checking vlarray with MD NumPy string atoms (with stride)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01d_StringAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, 'stringAtom', tb.StringAtom(itemsize=3, shape=(2,)), + "Ragged array of strings") + vlarray.flavor = "python" + a = np.array([["a", "b"], ["123", "45"], ["45", "123"]], dtype="S3") + vlarray.append(a[1::2]) + a = np.array([["s", "a"], ["ab", "f"], + ["s", "abc"], ["abc", "f"], + ["s", "ab"], ["ab", "f"]]) + vlarray.append(a[::3]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertEqual(row[0], [[b"123", b"45"]]) + self.assertEqual(row[1], [[b"s", b"a"], [b"abc", b"f"]]) + self.assertEqual(len(row[0]), 1) + self.assertEqual(len(row[1]), 2) + + def test02_BoolAtom(self): + """Checking vlarray with MD boolean atoms.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_BoolAtom..." 
% self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'BoolAtom', + tb.BoolAtom(shape=(3,)), + "Ragged array of Booleans") + vlarray.append([(1, 0, 3), (1, 1, 1), (0, 0, 0)]) + vlarray.append([(-1, 0, 0)]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([[1, 0, 1], [1, 1, 1], [0, 0, 0]], dtype='bool'))) + self.assertTrue(common.allequal( + row[1], np.array([[1, 0, 0]], dtype='bool'))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 1) + + def test02b_BoolAtom(self): + """Checking vlarray with MD boolean atoms (with offset)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_BoolAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'BoolAtom', + tb.BoolAtom(shape=(3,)), + "Ragged array of Booleans") + a = np.array( + [(0, 0, 0), (1, 0, 3), (1, 1, 1), (0, 0, 0)], dtype='bool') + vlarray.append(a[1:]) # Create an offset + a = np.array([(1, 1, 1), (-1, 0, 0)], dtype='bool') + vlarray.append(a[1:]) # Create an offset + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([[1, 0, 1], [1, 1, 1], [0, 0, 0]], dtype='bool'))) + self.assertTrue(common.allequal( + row[1], np.array([[1, 0, 0]], dtype='bool'))) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 1) + + def test02c_BoolAtom(self): + """Checking vlarray with MD boolean atoms (with strides)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02c_BoolAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'BoolAtom', + tb.BoolAtom(shape=(3,)), + "Ragged array of Booleans") + a = np.array( + [(0, 0, 0), (1, 0, 3), (1, 1, 1), (0, 0, 0)], dtype='bool') + vlarray.append(a[1::2]) # Create an strided array + a = np.array([(1, 1, 1), (-1, 0, 0), (0, 0, 0)], dtype='bool') + vlarray.append(a[::2]) # Create an strided array + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([[1, 0, 1], [0, 0, 0]], dtype='bool'))) + self.assertTrue(common.allequal( + row[1], np.array([[1, 1, 1], [0, 0, 0]], dtype='bool'))) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 2) + + def test03_IntAtom(self): + """Checking vlarray with MD integer atoms.""" + + ttypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + # "UInt64", # Unavailable in some platforms + ] + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_IntAtom..." 
% self.__class__.__name__) + + # Create an string atom + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + root, atype, atom=tb.Atom.from_sctype(atype, (2, 3))) + vlarray.append([np.ones((2, 3), atype), np.zeros((2, 3), atype)]) + vlarray.append([np.ones((2, 3), atype)*100]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", repr(row[1])) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array([np.ones((2, 3)), np.zeros((2, 3))], atype))) + self.assertTrue(common.allequal( + row[1], np.array([np.ones((2, 3)) * 100], atype))) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 1) + + def test04_FloatAtom(self): + """Checking vlarray with MD floating point atoms.""" + + ttypes = [ + "float32", + "float64", + "complex64", + "complex128", + ] + + for name in ("float16", "float96", "float128"): + atomname = name.capitalize() + "Atom" + if hasattr(tb, atomname): + ttypes.append(name) + for itemsize in (192, 256): + atomname = "Complex%dAtom" % itemsize + if hasattr(tb, atomname): + ttypes.append("complex%d" % (itemsize)) + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_FloatAtom..." % self.__class__.__name__) + + # Create an string atom + for atype in ttypes: + vlarray = self.h5file.create_vlarray( + root, atype, atom=tb.Atom.from_sctype(atype, (5, 2, 6))) + vlarray.append([np.ones((5, 2, 6), atype)*1.3, + np.zeros((5, 2, 6), atype)]) + vlarray.append([np.ones((5, 2, 6), atype)*2.e4]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing type:", atype) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, 2) + self.assertTrue(common.allequal( + row[0], np.array( + [np.ones((5, 2, 6)) * 1.3, np.zeros((5, 2, 6))], atype))) + self.assertTrue(common.allequal( + row[1], np.array([np.ones((5, 2, 6)) * 2.e4], atype))) + self.assertEqual(len(row[0]), 2) + self.assertEqual(len(row[1]), 1) + + +class MDTypesNumPyTestCase(MDTypesTestCase): + title = "MDTypes" + + +class AppendShapeTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + + def test00_difinputs(self): + """Checking vlarray.append() with different inputs.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test00_difinputs..." 
% self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'vlarray', + tb.Int32Atom(), + "Ragged array of ints") + vlarray.flavor = "python" + + # Check different ways to input + # All of the next should lead to the same rows + vlarray.append((1, 2, 3)) # a tuple + vlarray.append([1, 2, 3]) # a unique list + vlarray.append(np.array([1, 2, 3], dtype='int32')) # and array + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + vlarray = self.h5file.root.vlarray + + # Read all the vlarray + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual(row[0], [1, 2, 3]) + self.assertEqual(row[1], [1, 2, 3]) + self.assertEqual(row[2], [1, 2, 3]) + + def test01_toomanydims(self): + """Checking vlarray.append() with too many dimensions.""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_toomanydims..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'vlarray', + tb.StringAtom(itemsize=3), + "Ragged array of strings") + # Adding an array with one dimensionality more than allowed + with self.assertRaises(ValueError): + vlarray.append([["123", "456", "3"]]) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + vlarray = self.h5file.root.vlarray + + # Read all the rows (there should be none) + row = vlarray.read() + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + self.assertEqual(vlarray.nrows, 0) + + def test02_zerodims(self): + """Checking vlarray.append() with a zero-dimensional array""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_zerodims..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'vlarray', + tb.Int32Atom(), + "Ragged array of ints") + vlarray.append(np.zeros(dtype='int32', shape=(6, 0))) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + vlarray = self.h5file.root.vlarray + + # Read the only row in vlarray + row = vlarray.read(0)[0] + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", repr(row)) + + self.assertEqual(vlarray.nrows, 1) + self.assertTrue(common.allequal( + row, np.zeros(dtype='int32', shape=(0,)))) + self.assertEqual(len(row), 0) + + def test03a_cast(self): + """Checking vlarray.append() with a casted array (upgrading case)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03a_cast..." 
% self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'vlarray', + tb.Int32Atom(), + "Ragged array of ints") + # This type has to be upgraded + vlarray.append(np.array([1, 2], dtype='int16')) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + vlarray = self.h5file.root.vlarray + + # Read the only row in vlarray + row = vlarray.read(0)[0] + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", repr(row)) + + self.assertEqual(vlarray.nrows, 1) + self.assertTrue(common.allequal(row, np.array([1, 2], dtype='int32'))) + self.assertEqual(len(row), 2) + + def test03b_cast(self): + """Checking vlarray.append() with a casted array (downgrading case)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_cast..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, 'vlarray', + tb.Int32Atom(), + "Ragged array of ints") + # This type has to be downcasted + vlarray.append(np.array([1, 2], dtype='float64')) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + vlarray = self.h5file.root.vlarray + + # Read the only row in vlarray + row = vlarray.read(0)[0] + if common.verbose: + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", repr(row)) + + self.assertEqual(vlarray.nrows, 1) + self.assertTrue(common.allequal(row, np.array([1, 2], dtype='int32'))) + self.assertEqual(len(row), 2) + + +class OpenAppendShapeTestCase(AppendShapeTestCase): + close = 0 + + +class CloseAppendShapeTestCase(AppendShapeTestCase): + close = 1 + + +class FlavorTestCase(common.TempFileMixin, common.PyTablesTestCase): + open_mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + + def test01a_EmptyVLArray(self): + """Checking empty vlarrays with different flavors (closing the file)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_EmptyVLArray..." % + self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, "vlarray", tb.Atom.from_kind('int', itemsize=4)) + vlarray.flavor = self.flavor + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, "r") + + # Read all the rows (it should be empty): + vlarray = self.h5file.root.vlarray + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row, repr(row)) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + # Check that the object read is effectively empty + self.assertEqual(vlarray.nrows, 0) + self.assertEqual(row, []) + + def test01b_EmptyVLArray(self): + """Checking empty vlarrays with different flavors (no closing file)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_EmptyVLArray..." 
% + self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray( + root, "vlarray", tb.Atom.from_kind('int', itemsize=4)) + vlarray.flavor = self.flavor + + # Read all the rows (it should be empty): + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + # Check that the object read is effectively empty + self.assertEqual(vlarray.nrows, 0) + self.assertEqual(row, []) + + def test02_BooleanAtom(self): + """Checking vlarray with different flavors (boolean versions)""" + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_BoolAtom..." % self.__class__.__name__) + + # Create an string atom + vlarray = self.h5file.create_vlarray(root, "Bool", tb.BoolAtom()) + vlarray.flavor = self.flavor + vlarray.append([1, 2, 3]) + vlarray.append(()) # Empty row + vlarray.append([100, 0]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 0) + self.assertEqual(len(row[2]), 2) + if self.flavor == "python": + arr1 = [1, 1, 1] + arr2 = [] + arr3 = [1, 0] + elif self.flavor == "numpy": + arr1 = np.array([1, 1, 1], dtype="bool") + arr2 = np.array([], dtype="bool") + arr3 = np.array([1, 0], dtype="bool") + + if self.flavor == "numpy": + self.assertTrue(common.allequal(row[0], arr1, self.flavor)) + self.assertTrue(common.allequal(row[1], arr2, self.flavor)) + self.assertTrue(common.allequal(row[1], arr2, self.flavor)) + else: + # 'python' flavor + self.assertEqual(row[0], arr1) + self.assertEqual(row[1], arr2) + self.assertEqual(row[2], arr3) + + def test03_IntAtom(self): + """Checking vlarray with different flavors (integer versions)""" + + ttypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + # Not checked because some platforms does not support it + # "UInt64", + ] + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_IntAtom..." 
% self.__class__.__name__) + + # Create an string atom + for atype in ttypes: + vlarray = self.h5file.create_vlarray(root, atype, + tb.Atom.from_sctype(atype)) + vlarray.flavor = self.flavor + vlarray.append([1, 2, 3]) + vlarray.append(()) + vlarray.append([100, 0]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 0) + self.assertEqual(len(row[2]), 2) + if self.flavor == "python": + arr1 = [1, 2, 3] + arr2 = [] + arr3 = [100, 0] + elif self.flavor == "numpy": + arr1 = np.array([1, 2, 3], dtype=atype) + arr2 = np.array([], dtype=atype) + arr3 = np.array([100, 0], dtype=atype) + + if self.flavor == "numpy": + self.assertTrue(common.allequal(row[0], arr1, self.flavor)) + self.assertTrue(common.allequal(row[1], arr2, self.flavor)) + self.assertTrue(common.allequal(row[2], arr3, self.flavor)) + else: + # "python" flavor + self.assertEqual(row[0], arr1) + self.assertEqual(row[1], arr2) + self.assertEqual(row[2], arr3) + + def test03b_IntAtom(self): + """Checking vlarray flavors (integer versions and closed file)""" + + ttypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "int64", + # Not checked because some platforms does not support it + # "UInt64", + ] + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_IntAtom..." % self.__class__.__name__) + + # Create an string atom + for atype in ttypes: + vlarray = self.h5file.create_vlarray(root, atype, + tb.Atom.from_sctype(atype)) + vlarray.flavor = self.flavor + vlarray.append([1, 2, 3]) + vlarray.append(()) + vlarray.append([100, 0]) + self._reopen(mode='a') # open in "a"ppend mode + root = self.h5file.root # Very important! 
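The flavor tests all follow the same recipe: set the node's flavor, append a few rows, then check the container type that read() returns. The default "numpy" flavor yields one ndarray per row, while "python" yields plain lists. A minimal sketch, again with an illustrative file name:

    import tables as tb

    # Illustrative file name; the flavor is persisted as a node attribute.
    with tb.open_file("flavors.h5", "w") as h5:
        vl = h5.create_vlarray("/", "ints", atom=tb.Int32Atom())
        vl.flavor = "python"      # or "numpy" (the default)
        vl.append([1, 2, 3])
        vl.append(())             # an empty row
        vl.append([100, 0])
        rows = vl.read()          # [[1, 2, 3], [], [100, 0]] -- plain Python lists

Because the flavor is stored with the node, the reopen-and-get_node step that follows returns rows in the same flavor without any further setup.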
+ vlarray = self.h5file.get_node(root, str(atype)) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 0) + self.assertEqual(len(row[2]), 2) + if self.flavor == "python": + arr1 = [1, 2, 3] + arr2 = [] + arr3 = [100, 0] + elif self.flavor == "numpy": + arr1 = np.array([1, 2, 3], dtype=atype) + arr2 = np.array([], dtype=atype) + arr3 = np.array([100, 0], dtype=atype) + + if self.flavor == "numpy": + self.assertTrue(common.allequal(row[0], arr1, self.flavor)) + self.assertTrue(common.allequal(row[1], arr2, self.flavor)) + self.assertTrue(common.allequal(row[2], arr3, self.flavor)) + else: + # Tuple or List flavors + self.assertEqual(row[0], arr1) + self.assertEqual(row[1], arr2) + self.assertEqual(row[2], arr3) + + def test04_FloatAtom(self): + """Checking vlarray with different flavors (floating point versions)""" + + ttypes = [ + "float32", + "float64", + "complex64", + "complex128", + ] + + for name in ("float16", "float96", "float128"): + atomname = name.capitalize() + "Atom" + if hasattr(tb, atomname): + ttypes.append(name) + + for itemsize in (192, 256): + atomname = "Complex%dAtom" % itemsize + if hasattr(tb, atomname): + ttypes.append("complex%d" % (itemsize)) + + root = self.rootgroup + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_FloatAtom..." % self.__class__.__name__) + + # Create an string atom + for atype in ttypes: + vlarray = self.h5file.create_vlarray(root, atype, + tb.Atom.from_sctype(atype)) + vlarray.flavor = self.flavor + vlarray.append([1.3, 2.2, 3.3]) + vlarray.append(()) + vlarray.append([-1.3e34, 1.e-32]) + + # Read all the rows: + row = vlarray.read() + if common.verbose: + print("Testing flavor:", self.flavor) + print("Object read:", row) + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + + self.assertEqual(vlarray.nrows, 3) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 0) + self.assertEqual(len(row[2]), 2) + if self.flavor == "python": + arr1 = list(np.array([1.3, 2.2, 3.3], atype)) + arr2 = list(np.array([], atype)) + arr3 = list(np.array([-1.3e34, 1.e-32], atype)) + elif self.flavor == "numpy": + arr1 = np.array([1.3, 2.2, 3.3], dtype=atype) + arr2 = np.array([], dtype=atype) + arr3 = np.array([-1.3e34, 1.e-32], dtype=atype) + + if self.flavor == "numpy": + self.assertTrue(common.allequal(row[0], arr1, self.flavor)) + self.assertTrue(common.allequal(row[1], arr2, self.flavor)) + self.assertTrue(common.allequal(row[2], arr3, self.flavor)) + else: + # Tuple or List flavors + self.assertEqual(row[0], arr1) + self.assertEqual(row[1], arr2) + self.assertEqual(row[2], arr3) + + +class NumPyFlavorTestCase(FlavorTestCase): + flavor = "numpy" + + +class PythonFlavorTestCase(FlavorTestCase): + flavor = "python" + + +class ReadRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = 100 + mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + self.populateFile() + self._reopen() + + def populateFile(self): + group = self.rootgroup + filters = tb.Filters(complevel=self.compress, complib=self.complib) + vlarray = self.h5file.create_vlarray(group, 'vlarray', tb.Int32Atom(), 
+ "ragged array if ints", + filters=filters, + expectedrows=1000) + + # Fill it with 100 rows with variable length + for i in range(self.nrows): + vlarray.append(list(range(i))) + + def test01_start(self): + """Checking reads with only a start value""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Read some rows: + row = [] + row.append(vlarray.read(0)[0]) + row.append(vlarray.read(10)[0]) + row.append(vlarray.read(99)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal(row[0], np.arange(0, dtype='int32'))) + self.assertTrue(common.allequal(row[1], np.arange(10, dtype='int32'))) + self.assertTrue(common.allequal(row[2], np.arange(99, dtype='int32'))) + + def test01b_start(self): + """Checking reads with only a start value in a slice""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Read some rows: + row = [] + row.append(vlarray[0]) + row.append(vlarray[10]) + row.append(vlarray[99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal(row[0], np.arange(0, dtype='int32'))) + self.assertTrue(common.allequal(row[1], np.arange(10, dtype='int32'))) + self.assertTrue(common.allequal(row[2], np.arange(99, dtype='int32'))) + + def test01np_start(self): + """Checking reads with only a start value in a slice (numpy indexes)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01np_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Read some rows: + row = [] + row.append(vlarray[np.int8(0)]) + row.append(vlarray[np.int32(10)]) + row.append(vlarray[np.int64(99)]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal(row[0], np.arange(0, dtype='int32'))) + self.assertTrue(common.allequal(row[1], np.arange(10, dtype='int32'))) + self.assertTrue(common.allequal(row[2], np.arange(99, dtype='int32'))) + + def test02_stop(self): + """Checking reads with only a stop value""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_stop..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray.read(stop=1)) + row.append(vlarray.read(stop=10)) + row.append(vlarray.read(stop=99)) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 1) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal( + row[0][0], np.arange(0, dtype='int32'))) + for x in range(10): + self.assertTrue(common.allequal( + row[1][x], np.arange(x, dtype='int32'))) + for x in range(99): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test02b_stop(self): + """Checking reads with only a stop value in a slice""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_stop..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[:1]) + row.append(vlarray[:10]) + row.append(vlarray[:99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 1) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + for x in range(1): + self.assertTrue(common.allequal( + row[0][x], np.arange(0, dtype='int32'))) + for x in range(10): + self.assertTrue(common.allequal( + row[1][x], np.arange(x, dtype='int32'))) + for x in range(99): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test03_startstop(self): + """Checking reads with a start and stop values""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_startstop..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray.read(0, 10)) + row.append(vlarray.read(5, 15)) + row.append(vlarray.read(0, 100)) # read all the array + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 10) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 100) + for x in range(0, 10): + self.assertTrue(common.allequal( + row[0][x], np.arange(x, dtype='int32'))) + for x in range(5, 15): + self.assertTrue(common.allequal( + row[1][x-5], np.arange(x, dtype='int32'))) + for x in range(0, 100): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test03b_startstop(self): + """Checking reads with a start and stop values in slices""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_startstop..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[0:10]) + row.append(vlarray[5:15]) + row.append(vlarray[:]) # read all the array + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 10) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 100) + for x in range(0, 10): + self.assertTrue(common.allequal( + row[0][x], np.arange(x, dtype='int32'))) + for x in range(5, 15): + self.assertTrue(common.allequal( + row[1][x-5], np.arange(x, dtype='int32'))) + for x in range(0, 100): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test04_startstopstep(self): + """Checking reads with a start, stop & step values""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_startstopstep..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray.read(0, 10, 2)) + row.append(vlarray.read(5, 15, 3)) + row.append(vlarray.read(0, 100, 20)) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue(common.allequal( + row[0][x // 2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue(common.allequal( + row[1][(x - 5) // 3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue(common.allequal( + row[2][x // 20], np.arange(x, dtype='int32'))) + + def test04np_startstopstep(self): + """Checking reads with a start, stop & step values (numpy indices)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04np_startstopstep..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray.read(np.int8(0), np.int8(10), np.int8(2))) + row.append(vlarray.read(np.int8(5), np.int8(15), np.int8(3))) + row.append(vlarray.read(np.int8(0), np.int8(100), np.int8(20))) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue(common.allequal( + row[0][x // 2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue(common.allequal( + row[1][(x - 5) // 3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue(common.allequal( + row[2][x // 20], np.arange(x, dtype='int32'))) + + def test04b_slices(self): + """Checking reads with start, stop & step values in slices""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04b_slices..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[0:10:2]) + row.append(vlarray[5:15:3]) + row.append(vlarray[0:100:20]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue(common.allequal( + row[0][x // 2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue(common.allequal( + row[1][(x - 5) // 3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue(common.allequal( + row[2][x // 20], np.arange(x, dtype='int32'))) + + def test04bnp_slices(self): + """Checking reads with start, stop & step values in slices. + + (numpy indices) + + """ + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04bnp_slices..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[np.int16(0):np.int16(10):np.int32(2)]) + row.append(vlarray[np.int16(5):np.int16(15):np.int64(3)]) + row.append(vlarray[np.uint16(0):np.int32(100):np.int8(20)]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue( + common.allequal(row[0][x//2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue( + common.allequal(row[1][(x-5)//3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue( + common.allequal(row[2][x//20], np.arange(x, dtype='int32'))) + + def test05_out_of_range(self): + """Checking out of range reads""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_out_of_range..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + with self.assertRaises(IndexError): + row = vlarray.read(1000)[0] + print("row-->", row) + + +class GetItemRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = 100 + open_mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + + self.rootgroup = self.h5file.root + self.populateFile() + self._reopen() + + def populateFile(self): + group = self.rootgroup + filters = tb.Filters(complevel=self.compress, complib=self.complib) + vlarray = self.h5file.create_vlarray(group, 'vlarray', tb.Int32Atom(), + "ragged array if ints", + filters=filters, + expectedrows=1000) + + # Fill it with 100 rows with variable length + for i in range(self.nrows): + vlarray.append(list(range(i))) + + def test01_start(self): + """Checking reads with only a start value""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_start..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Read some rows: + row = [] + row.append(vlarray[0]) + + # rank-0 array should work as a regular index (see #303) + row.append(vlarray[np.array(10)]) + row.append(vlarray[99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal(row[0], np.arange(0, dtype='int32'))) + self.assertTrue(common.allequal(row[1], np.arange(10, dtype='int32'))) + self.assertTrue(common.allequal(row[2], np.arange(99, dtype='int32'))) + + def test01b_start(self): + """Checking reads with only a start value in a slice""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Read some rows: + row = [] + row.append(vlarray[0]) + row.append(vlarray[10]) + row.append(vlarray[99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal(row[0], np.arange(0, dtype='int32'))) + self.assertTrue(common.allequal(row[1], np.arange(10, dtype='int32'))) + self.assertTrue(common.allequal(row[2], np.arange(99, dtype='int32'))) + + def test02_stop(self): + """Checking reads with only a stop value""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_stop..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[:1]) + row.append(vlarray[:10]) + row.append(vlarray[:99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("First row in vlarray ==>", row[0]) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 1) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal( + row[0][0], np.arange(0, dtype='int32'))) + for x in range(10): + self.assertTrue(common.allequal( + row[1][x], np.arange(x, dtype='int32'))) + for x in range(99): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test02b_stop(self): + """Checking reads with only a stop value in a slice""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02b_stop..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[:1]) + row.append(vlarray[:10]) + row.append(vlarray[:99]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 1) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + for x in range(1): + self.assertTrue(common.allequal( + row[0][x], np.arange(0, dtype='int32'))) + for x in range(10): + self.assertTrue(common.allequal( + row[1][x], np.arange(x, dtype='int32'))) + for x in range(99): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test03_startstop(self): + """Checking reads with a start and stop values""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_startstop..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[0:10]) + row.append(vlarray[5:15]) + row.append(vlarray[0:100]) # read all the array + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 10) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 100) + for x in range(0, 10): + self.assertTrue(common.allequal( + row[0][x], np.arange(x, dtype='int32'))) + for x in range(5, 15): + self.assertTrue(common.allequal( + row[1][x-5], np.arange(x, dtype='int32'))) + for x in range(0, 100): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test03b_startstop(self): + """Checking reads with a start and stop values in slices""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_startstop..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[0:10]) + row.append(vlarray[5:15]) + row.append(vlarray[:]) # read all the array + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 10) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 100) + for x in range(0, 10): + self.assertTrue(common.allequal( + row[0][x], np.arange(x, dtype='int32'))) + for x in range(5, 15): + self.assertTrue(common.allequal( + row[1][x-5], np.arange(x, dtype='int32'))) + for x in range(0, 100): + self.assertTrue(common.allequal( + row[2][x], np.arange(x, dtype='int32'))) + + def test04_slices(self): + """Checking reads with a start, stop & step values""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_slices..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[0:10:2]) + row.append(vlarray[5:15:3]) + row.append(vlarray[0:100:20]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue( + common.allequal(row[0][x//2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue( + common.allequal(row[1][(x-5)//3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue( + common.allequal(row[2][x//20], np.arange(x, dtype='int32'))) + + def test04bnp_slices(self): + """Checking reads with start, stop & step values (numpy indices)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04np_slices..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Choose a small value for buffer size + vlarray._nrowsinbuf = 3 + + # Read some rows: + row = [] + row.append(vlarray[np.int8(0):np.int8(10):np.int8(2)]) + row.append(vlarray[np.int8(5):np.int8(15):np.int8(3)]) + row.append(vlarray[np.int8(0):np.int8(100):np.int8(20)]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 5) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + for x in range(0, 10, 2): + self.assertTrue( + common.allequal(row[0][x//2], np.arange(x, dtype='int32'))) + for x in range(5, 15, 3): + self.assertTrue( + common.allequal(row[1][(x-5)//3], np.arange(x, dtype='int32'))) + for x in range(0, 100, 20): + self.assertTrue( + common.allequal(row[2][x//20], np.arange(x, dtype='int32'))) + + def test05_out_of_range(self): + """Checking out of range reads""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_out_of_range..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + with self.assertRaises(IndexError): + row = vlarray[1000] + print("row-->", row) + + def test05np_out_of_range(self): + """Checking out of range reads (numpy indexes)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05np_out_of_range..." 
% + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + with self.assertRaises(IndexError): + row = vlarray[np.int32(1000)] + print("row-->", row) + + +class SetRangeTestCase(common.TempFileMixin, common.PyTablesTestCase): + nrows = 100 + open_mode = "w" + compress = 0 + complib = "zlib" # Default compression library + + def setUp(self): + super().setUp() + self.rootgroup = self.h5file.root + self.populateFile() + self._reopen(mode='a') + + def populateFile(self): + group = self.rootgroup + filters = tb.Filters(complevel=self.compress, complib=self.complib) + vlarray = self.h5file.create_vlarray(group, 'vlarray', tb.Int32Atom(), + "ragged array if ints", + filters=filters, + expectedrows=1000) + + # Fill it with 100 rows with variable length + for i in range(self.nrows): + vlarray.append(list(range(i))) + + def test01_start(self): + """Checking updates that modifies a complete row""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[0] = vlarray[0]*2 + 3 + vlarray[10] = vlarray[10]*2 + 3 + vlarray[99] = vlarray[99]*2 + 3 + + # Read some rows: + row = [] + row.append(vlarray.read(0)[0]) + row.append(vlarray.read(10)[0]) + row.append(vlarray.read(99)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal( + row[0], np.arange(0, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(10, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[2], np.arange(99, dtype='int32') * 2 + 3)) + + def test01np_start(self): + """Checking updates that modifies a complete row""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01np_start..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[np.int8(0)] = vlarray[np.int16(0)]*2 + 3 + vlarray[np.int8(10)] = vlarray[np.int8(10)]*2 + 3 + vlarray[np.int32(99)] = vlarray[np.int64(99)]*2 + 3 + + # Read some rows: + row = [] + row.append(vlarray.read(np.int8(0))[0]) + row.append(vlarray.read(np.int8(10))[0]) + row.append(vlarray.read(np.int8(99))[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 99) + self.assertTrue(common.allequal( + row[0], np.arange(0, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(10, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[2], np.arange(99, dtype='int32') * 2 + 3)) + + def test02_partial(self): + """Checking updates with only a part of a row""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_partial..." 
% self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[0] = vlarray[0]*2 + 3 + vlarray[10] = vlarray[10]*2 + 3 + vlarray[96] = vlarray[99][3:]*2 + 3 + + # Read some rows: + row = [] + row.append(vlarray.read(0)[0]) + row.append(vlarray.read(10)[0]) + row.append(vlarray.read(96)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 96) + self.assertTrue(common.allequal( + row[0], np.arange(0, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(10, dtype='int32') * 2 + 3)) + a = np.arange(3, 99, dtype='int32') + a = a * 2 + 3 + self.assertTrue(common.allequal(row[2], a)) + + def test03a_several_rows(self): + """Checking updating several rows at once (slice style)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03a_several_rows..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[3:6] = (vlarray[3]*2 + 3, + vlarray[4]*2 + 3, + vlarray[5]*2 + 3) + + # Read some rows: + row = [] + row.append(vlarray.read(3)[0]) + row.append(vlarray.read(4)[0]) + row.append(vlarray.read(5)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 3) + self.assertEqual(len(row[1]), 4) + self.assertEqual(len(row[2]), 5) + self.assertTrue(common.allequal( + row[0], np.arange(3, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(4, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[2], np.arange(5, dtype='int32') * 2 + 3)) + + def test03b_several_rows(self): + """Checking updating several rows at once (list style)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03b_several_rows..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[[0, 10, 96]] = (vlarray[0]*2 + 3, + vlarray[10]*2 + 3, + vlarray[96]*2 + 3) + + # Read some rows: + row = [] + row.append(vlarray.read(0)[0]) + row.append(vlarray.read(10)[0]) + row.append(vlarray.read(96)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 96) + self.assertTrue(common.allequal( + row[0], np.arange(0, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(10, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[2], np.arange(96, dtype='int32') * 2 + 3)) + + def test03c_several_rows(self): + """Checking updating several rows at once (NumPy's where style)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03c_several_rows..." 
% + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + # Modify some rows: + vlarray[(np.array([0, 10, 96]),)] = (vlarray[0] * 2 + 3, + vlarray[10] * 2 + 3, + vlarray[96] * 2 + 3) + + # Read some rows: + row = [] + row.append(vlarray.read(0)[0]) + row.append(vlarray.read(10)[0]) + row.append(vlarray.read(96)[0]) + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + print("Second row in vlarray ==>", row[1]) + + self.assertEqual(vlarray.nrows, self.nrows) + self.assertEqual(len(row[0]), 0) + self.assertEqual(len(row[1]), 10) + self.assertEqual(len(row[2]), 96) + self.assertTrue(common.allequal( + row[0], np.arange(0, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[1], np.arange(10, dtype='int32') * 2 + 3)) + self.assertTrue(common.allequal( + row[2], np.arange(96, dtype='int32') * 2 + 3)) + + def test04_out_of_range(self): + """Checking out of range updates (first index)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_out_of_range..." % + self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + with self.assertRaises(IndexError): + vlarray[1000] = [1] + + def test05_value_error(self): + """Checking out value errors""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_value_error..." % self.__class__.__name__) + + vlarray = self.h5file.root.vlarray + + if common.verbose: + print("Nrows in", vlarray._v_pathname, ":", vlarray.nrows) + + with self.assertRaises(ValueError): + vlarray[10] = [1]*100 + + +class CopyTestCase(common.TempFileMixin, common.PyTablesTestCase): + close = True + + def test01a_copy(self): + """Checking VLArray.copy() method.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01a_copy..." % self.__class__.__name__) + + # Create an Vlarray + arr = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', arr, "title array1") + array1.flavor = "python" + array1.append([[2, 3]]) + array1.append(()) # an empty row + array1.append([[3, 457], [2, 4]]) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", repr(array1)) + print("array2-->", repr(array2)) + print("array1[:]-->", repr(array1.read())) + print("array2[:]-->", repr(array2.read())) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertEqual(array1.read(), array2.read()) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(repr(array1.atom), repr(array2.atom)) + + self.assertEqual(array1.title, array2.title) + + def test01b_copy(self): + """Checking VLArray.copy() method (Pseudo-atom case)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01b_copy..." 
% self.__class__.__name__) + + # Create an Vlarray + arr = tb.VLStringAtom() + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', arr, "title array1") + array1.flavor = "python" + array1.append(b"a string") + array1.append(b"") # an empty row + array1.append(b"another string") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("array1-->", repr(array1)) + print("array2-->", repr(array2)) + print("array1[:]-->", repr(array1.read())) + print("array2[:]-->", repr(array2.read())) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertEqual(array1.read(), array2.read()) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.type, array2.atom.type) + self.assertEqual(repr(array1.atom), repr(array2.atom)) + + self.assertEqual(array1.title, array2.title) + + def test02_copy(self): + """Checking VLArray.copy() method (where specified)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test02_copy..." % self.__class__.__name__) + + # Create an VLArray + arr = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', arr, "title array1") + array1.flavor = "python" + array1.append([[2, 3]]) + array1.append(()) # an empty row + array1.append([[3, 457], [2, 4]]) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + group1 = self.h5file.create_group("/", "group1") + array2 = array1.copy(group1, 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.group1.array2 + + if common.verbose: + print("array1-->", repr(array1)) + print("array2-->", repr(array2)) + print("array1-->", array1.read()) + print("array2-->", array2.read()) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Check that all the elements are equal + self.assertEqual(array1.read(), array2.read()) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.flavor, array2.flavor) + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(repr(array1.atom), repr(array1.atom)) + self.assertEqual(array1.title, array2.title) + + def test03_copy(self): + """Checking VLArray.copy() method ('python' flavor)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test03_copy..." 
% self.__class__.__name__) + + # Create an VLArray + atom = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', atom, title="title array1") + array1.flavor = "python" + array1.append(((2, 3),)) + array1.append(()) # an empty row + array1.append(((3, 457), (2, 4))) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another location + array2 = array1.copy('/', 'array2') + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert other properties in array + self.assertEqual(array1.nrows, array2.nrows) + self.assertEqual(array1.shape, array2.shape) + self.assertEqual(array1.flavor, array2.flavor) # Very important here + self.assertEqual(array1.atom.dtype, array2.atom.dtype) + self.assertEqual(repr(array1.atom), repr(array1.atom)) + self.assertEqual(array1.title, array2.title) + + def test04_copy(self): + """Checking VLArray.copy() method (checking title copying)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test04_copy..." % self.__class__.__name__) + + # Create an VLArray + atom = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', atom=atom, title="title array1") + array1.append(((2, 3),)) + array1.append(()) # an empty row + array1.append(((3, 457), (2, 4))) + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', title="title array2") + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + # Assert user attributes + if common.verbose: + print("title of destination array-->", array2.title) + self.assertEqual(array2.title, "title array2") + + def test05_copy(self): + """Checking VLArray.copy() method (user attributes copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05_copy..." 
% self.__class__.__name__) + + # Create an Array + atom = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', atom=atom, title="title array1") + array1.append(((2, 3),)) + array1.append(()) # an empty row + array1.append(((3, 457), (2, 4))) + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(array2.attrs.attr1, "attr1") + self.assertEqual(array2.attrs.attr2, 2) + + def notest05b_copy(self): + """Checking VLArray.copy() method (user attributes not copied)""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test05b_copy..." % self.__class__.__name__) + + # Create an VLArray + atom = tb.Int16Atom(shape=2) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', atom=atom, title="title array1") + array1.append(((2, 3),)) + array1.append(()) # an empty row + array1.append(((3, 457), (2, 4))) + + # Append some user attrs + array1.attrs.attr1 = "attr1" + array1.attrs.attr2 = 2 + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy it to another Array + array2 = array1.copy('/', 'array2', copyuserattrs=0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + array2 = self.h5file.root.array2 + + if common.verbose: + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + + # Assert user attributes + self.assertEqual(array2.attrs.attr1, None) + self.assertEqual(array2.attrs.attr2, None) + + +class CloseCopyTestCase(CopyTestCase): + close = 1 + + +class OpenCopyTestCase(CopyTestCase): + close = 0 + + +class CopyIndexTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def test01_index(self): + """Checking VLArray.copy() method with indexes.""" + + if common.verbose: + print('\n', '-=' * 30) + print("Running %s.test01_index..." 
% self.__class__.__name__) + + # Create an VLArray + atom = tb.Int32Atom(shape=(2,)) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', atom, "t array1") + array1.flavor = "python" + + # The next creates 20 rows of variable length + r = [] + for row in range(20): + r.append([[row, row + 1]]) + array1.append([row, row + 1]) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen(mode='a') + array1 = self.h5file.root.array1 + + # Copy to another array + array2 = array1.copy("/", 'array2', + start=self.start, + stop=self.stop, + step=self.step) + + r2 = r[self.start:self.stop:self.step] + if common.verbose: + print("r2-->", r2) + print("array2-->", array2[:]) + print("attrs array1-->", repr(array1.attrs)) + print("attrs array2-->", repr(array2.attrs)) + print("nrows in array2-->", array2.nrows) + print("and it should be-->", len(r2)) + + # Check that all the elements are equal + self.assertEqual(r2, array2[:]) + + # Assert the number of rows in array + self.assertEqual(len(r2), array2.nrows) + + +class CopyIndex1TestCase(CopyIndexTestCase): + close = 0 + start = 0 + stop = 7 + step = 1 + + +class CopyIndex2TestCase(CopyIndexTestCase): + close = 1 + start = 0 + stop = -1 + step = 1 + + +class CopyIndex3TestCase(CopyIndexTestCase): + close = 0 + start = 1 + stop = 7 + step = 1 + + +class CopyIndex4TestCase(CopyIndexTestCase): + close = 1 + start = 0 + stop = 6 + step = 1 + + +class CopyIndex5TestCase(CopyIndexTestCase): + close = 0 + start = 3 + stop = 7 + step = 1 + + +class CopyIndex6TestCase(CopyIndexTestCase): + close = 1 + start = 3 + stop = 6 + step = 2 + + +class CopyIndex7TestCase(CopyIndexTestCase): + close = 0 + start = 0 + stop = 7 + step = 10 + + +class CopyIndex8TestCase(CopyIndexTestCase): + close = 1 + start = 6 + stop = -1 # Negative values means starting from the end + step = 1 + + +class CopyIndex9TestCase(CopyIndexTestCase): + close = 0 + start = 3 + stop = 4 + step = 1 + + +class CopyIndex10TestCase(CopyIndexTestCase): + close = 1 + start = 3 + stop = 4 + step = 2 + + +class CopyIndex11TestCase(CopyIndexTestCase): + close = 0 + start = -3 + stop = -1 + step = 2 + + +class CopyIndex12TestCase(CopyIndexTestCase): + close = 1 + start = -1 # Should point to the last element + stop = None # None should mean the last element (including it) + step = 1 + + +class ChunkshapeTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + atom = tb.Int32Atom(shape=(2,)) + self.h5file.create_vlarray('/', 'vlarray', atom=atom, + title="t array1", + chunkshape=13) + + def test00(self): + """Test setting the chunkshape in a table (no reopen).""" + + vla = self.h5file.root.vlarray + if common.verbose: + print("chunkshape-->", vla.chunkshape) + self.assertEqual(vla.chunkshape, (13,)) + + def test01(self): + """Test setting the chunkshape in a table (reopen).""" + + self.h5file.close() + self.h5file = tb.open_file(self.h5fname, 'r') + vla = self.h5file.root.vlarray + if common.verbose: + print("chunkshape-->", vla.chunkshape) + self.assertEqual(vla.chunkshape, (13,)) + + +class VLUEndianTestCase(common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.h5fname = common.test_filename('vlunicode_endian.h5') + self.h5file = tb.open_file(self.h5fname) + + def tearDown(self): + self.h5file.close() + super().tearDown() + + def test(self): + """Accessing ``vlunicode`` data of a different endianness.""" + + bedata = self.h5file.root.vlunicode_big[0] + ledata = self.h5file.root.vlunicode_little[0] + 
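For illustration, the start/stop/step semantics exercised by the CopyIndex* cases above can be reproduced in a short standalone script; the file and node names below are illustrative and not part of the test suite.

    import tables as tb

    with tb.open_file("copy_slice_demo.h5", "w") as h5:
        src = h5.create_vlarray("/", "src", tb.Int32Atom(shape=(2,)), chunkshape=13)
        for row in range(20):
            src.append([[row, row + 1]])      # one variable-length row per append
        # Copy rows 3 and 5 only (start=3, stop=7, step=2)
        dst = src.copy("/", "dst", start=3, stop=7, step=2)
        print(src.chunkshape)   # -> (13,)
        print(dst.nrows)        # -> 2
        print(dst.read())       # -> [array([[3, 4]], ...), array([[5, 6]], ...)]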
self.assertEqual(bedata, 'para\u0140lel') + self.assertEqual(ledata, 'para\u0140lel') + + +class TruncateTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + # Create an VLArray + arr = tb.Int16Atom(dflt=3) + array1 = self.h5file.create_vlarray( + self.h5file.root, 'array1', arr, "title array1") + + # Add a couple of rows + array1.append(np.array([456, 2], dtype='int16')) + array1.append(np.array([3], dtype='int16')) + + def test00_truncate(self): + """Checking VLArray.truncate() method (truncating to 0 rows)""" + + array1 = self.h5file.root.array1 + # Truncate to 0 elements + array1.truncate(0) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertEqual(array1.nrows, 0) + self.assertEqual(array1[:], []) + + def test01_truncate(self): + """Checking VLArray.truncate() method (truncating to 1 rows)""" + + array1 = self.h5file.root.array1 + # Truncate to 1 element + array1.truncate(1) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertEqual(array1.nrows, 1) + self.assertTrue(common.allequal( + array1[0], np.array([456, 2], dtype='int16'))) + + def test02_truncate(self): + """Checking VLArray.truncate() method (truncating to == self.nrows)""" + + array1 = self.h5file.root.array1 + # Truncate to 2 elements + array1.truncate(2) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertEqual(array1.nrows, 2) + self.assertTrue( + common.allequal(array1[0], np.array([456, 2], dtype='int16'))) + self.assertTrue(common.allequal( + array1[1], np.array([3], dtype='int16'))) + + def test03_truncate(self): + """Checking VLArray.truncate() method (truncating to > self.nrows)""" + + array1 = self.h5file.root.array1 + # Truncate to 4 elements + array1.truncate(4) + + if self.close: + if common.verbose: + print("(closing file version)") + self._reopen() + array1 = self.h5file.root.array1 + + if common.verbose: + print("array1-->", array1.read()) + + self.assertEqual(array1.nrows, 4) + + # Check the original values + self.assertTrue( + common.allequal(array1[0], np.array([456, 2], dtype='int16'))) + self.assertTrue(common.allequal( + array1[1], np.array([3], dtype='int16'))) + + # Check that the added rows are empty + self.assertTrue(common.allequal( + array1[2], np.array([], dtype='int16'))) + self.assertTrue(common.allequal( + array1[3], np.array([], dtype='int16'))) + + +class TruncateOpenTestCase(TruncateTestCase): + close = 0 + + +class TruncateCloseTestCase(TruncateTestCase): + close = 1 + + +class PointSelectionTestCase(common.TempFileMixin, common.PyTablesTestCase): + + def setUp(self): + super().setUp() + + # The next are valid selections for both NumPy and PyTables + self.working_keyset = [ + [], # empty list + [2], # single-entry list + [0, 2], # list + [0, -2], # negative values + ([0, 2],), # tuple of list + np.array([], dtype="i4"), # empty array + np.array([1], dtype="i4"), # single-entry array + np.array([True, False, True]), # array of bools + ] + + # The next are invalid selections for VLArrays + self.not_working_keyset = [ + [1, 2, 100], # coordinate 100 > len(vlarray) + ([True, False, True],), # tuple of bools + ] 
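Point selection on a VLArray accepts the same coordinate-style keys as NumPy fancy indexing, which is what the working/not-working key lists above are checked against. A minimal standalone sketch (file and node names are illustrative):

    import numpy as np
    import tables as tb

    with tb.open_file("point_sel_demo.h5", "w") as h5:
        vla = h5.create_vlarray("/", "vla", tb.Int32Atom())
        vla.append(np.array([5, 6], dtype="i4"))
        vla.append(np.array([5, 6, 7], dtype="i4"))
        vla.append(np.array([5, 6, 9, 8], dtype="i4"))
        print(vla[[0, 2]])                          # rows 0 and 2, as in NumPy
        print(vla[np.array([True, False, True])])   # boolean masks work as well
        # vla[[1, 2, 100]] would raise IndexError: coordinate 100 > vla.nrows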
+ + # Create a sample array + arr1 = np.array([5, 6], dtype="i4") + arr2 = np.array([5, 6, 7], dtype="i4") + arr3 = np.array([5, 6, 9, 8], dtype="i4") + self.nparr = np.array([arr1, arr2, arr3], dtype="object") + + # Create the VLArray + self.vlarr = self.h5file.create_vlarray( + self.h5file.root, 'vlarray', tb.Int32Atom()) + self.vlarr.append(arr1) + self.vlarr.append(arr2) + self.vlarr.append(arr3) + + def test01a_read(self): + """Test for point-selections (read, boolean keys).""" + + nparr = self.nparr + vlarr = self.vlarr + for key in self.working_keyset: + if common.verbose: + print("Selection to test:", repr(key)) + a = nparr[key].tolist() + b = vlarr[key] + # if common.verbose: + # print "NumPy selection:", a, type(a) + # print "PyTables selection:", b, type(b) + self.assertEqual( + repr(a), repr(b), + "NumPy array and PyTables selections does not match.") + + def test01b_read(self): + """Test for point-selections (not working selections, read).""" + + vlarr = self.vlarr + for key in self.not_working_keyset: + if common.verbose: + print("Selection to test:", key) + self.assertRaises(IndexError, vlarr.__getitem__, key) + + +class SizeInMemoryPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + def create_array(self, atom, complevel): + filters = tb.Filters(complevel=complevel, complib='blosc') + self.array = self.h5file.create_vlarray('/', 'vlarray', atom=atom, + filters=filters) + + def test_zero_length(self): + atom = tb.Int32Atom() + complevel = 0 + self.create_array(atom, complevel) + self.assertEqual(self.array.size_in_memory, 0) + + def int_tests(self, complevel, flavor): + atom = tb.Int32Atom() + self.create_array(atom, complevel) + self.array.flavor = flavor + expected_size = 0 + for i in range(10): + row = np.arange((i + 1) * 10, dtype='i4') + self.array.append(row) + expected_size += row.nbytes + return expected_size + + def test_numpy_int_numpy_flavor(self): + complevel = 0 + flavor = 'numpy' + expected_size = self.int_tests(complevel, flavor) + self.assertEqual(self.array.size_in_memory, expected_size) + + # compression will have no effect, since this is uncompressed size + def test_numpy_int_numpy_flavor_compressed(self): + complevel = 1 + flavor = 'numpy' + expected_size = self.int_tests(complevel, flavor) + self.assertEqual(self.array.size_in_memory, expected_size) + + # flavor will have no effect on what's stored in HDF5 file + def test_numpy_int_python_flavor(self): + complevel = 0 + flavor = 'python' + expected_size = self.int_tests(complevel, flavor) + self.assertEqual(self.array.size_in_memory, expected_size) + + # this relies on knowledge of the implementation, so it's not + # a great test + def test_object_atom(self): + atom = tb.ObjectAtom() + complevel = 0 + self.create_array(atom, complevel) + obj = [1, 2, 3] + for i in range(10): + self.array.append(obj) + pickle_array = atom.toarray(obj) + expected_size = 10 * pickle_array.nbytes + self.assertEqual(self.array.size_in_memory, expected_size) + + +class SizeOnDiskPropertyTestCase(common.TempFileMixin, + common.PyTablesTestCase): + def create_array(self, atom, complevel): + filters = tb.Filters(complevel=complevel, complib='blosc') + self.h5file.create_vlarray('/', 'vlarray', atom, filters=filters) + self.array = self.h5file.get_node('/', 'vlarray') + + def test_not_implemented(self): + atom = tb.IntAtom() + complevel = 0 + self.create_array(atom, complevel) + self.assertRaises(NotImplementedError, getattr, self.array, + 'size_on_disk') + + +class AccessClosedTestCase(common.TempFileMixin, 
common.PyTablesTestCase): + def setUp(self): + super().setUp() + self.array = self.h5file.create_vlarray( + self.h5file.root, 'array', atom=tb.StringAtom(8)) + self.array.append([str(i) for i in range(5, 5005, 100)]) + + def test_read(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.array.read) + + def test_getitem(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.array.__getitem__, 0) + + def test_setitem(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.array.__setitem__, 0, '0') + + def test_append(self): + self.h5file.close() + self.assertRaises( + tb.ClosedNodeError, self.array.append, 'xxxxxxxxx') + + +class TestCreateVLArrayArgs(common.TempFileMixin, common.PyTablesTestCase): + obj = np.array([1, 2, 3]) + where = '/' + name = 'vlarray' + atom = tb.Atom.from_dtype(obj.dtype) + title = 'title' + filters = None + expectedrows = None + chunkshape = None + byteorder = None + createparents = False + + def test_positional_args_01(self): + self.h5file.create_vlarray(self.where, self.name, + self.atom, + self.title, self.filters, + self.expectedrows) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (0,)) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + + def test_positional_args_02(self): + ptarr = self.h5file.create_vlarray(self.where, self.name, + self.atom, + self.title, + self.filters, + self.expectedrows) + ptarr.append(self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read()[0] + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (1,)) + self.assertEqual(ptarr[0].shape, self.obj.shape) + self.assertEqual(ptarr.nrows, 1) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_positional_args_obj(self): + self.h5file.create_vlarray(self.where, self.name, + None, + self.title, + self.filters, + self.expectedrows, + self.chunkshape, + self.byteorder, + self.createparents, + self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read()[0] + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (1,)) + self.assertEqual(ptarr[0].shape, self.obj.shape) + self.assertEqual(ptarr.nrows, 1) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj(self): + self.h5file.create_vlarray(self.where, self.name, title=self.title, + obj=self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read()[0] + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (1,)) + self.assertEqual(ptarr[0].shape, self.obj.shape) + self.assertEqual(ptarr.nrows, 1) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_01(self): + ptarr = self.h5file.create_vlarray(self.where, self.name, + title=self.title, + atom=self.atom) + 
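As the argument-handling tests here rely on, create_vlarray() can either infer the atom from an initial obj or take an explicit atom and have rows appended later; passing an obj together with an incompatible atom is rejected. A small sketch with an illustrative file name:

    import numpy as np
    import tables as tb

    obj = np.array([1, 2, 3])
    with tb.open_file("create_args_demo.h5", "w") as h5:
        # obj given: the atom is inferred and obj becomes the first (and only) row
        a = h5.create_vlarray("/", "from_obj", obj=obj, title="title")
        # atom given, no obj: an empty VLArray (shape (0,)) until something is appended
        b = h5.create_vlarray("/", "from_atom", atom=tb.Atom.from_dtype(obj.dtype))
        b.append(obj)
        print(a.nrows, b.nrows)   # -> 1 1
        # create_vlarray(..., obj=obj, atom=tb.ComplexAtom(itemsize=16)) raises TypeError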
ptarr.append(self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read()[0] + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (1,)) + self.assertEqual(ptarr[0].shape, self.obj.shape) + self.assertEqual(ptarr.nrows, 1) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_atom_02(self): + ptarr = self.h5file.create_vlarray(self.where, self.name, + title=self.title, + atom=self.atom) + # ptarr.append(self.obj) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (0,)) + self.assertEqual(ptarr.nrows, 0) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + + def test_kwargs_obj_atom(self): + ptarr = self.h5file.create_vlarray(self.where, self.name, + title=self.title, + obj=self.obj, + atom=self.atom) + self.h5file.close() + + self.h5file = tb.open_file(self.h5fname) + ptarr = self.h5file.get_node(self.where, self.name) + nparr = ptarr.read()[0] + + self.assertEqual(ptarr.title, self.title) + self.assertEqual(ptarr.shape, (1,)) + self.assertEqual(ptarr[0].shape, self.obj.shape) + self.assertEqual(ptarr.nrows, 1) + self.assertEqual(ptarr.atom, self.atom) + self.assertEqual(ptarr.atom.dtype, self.atom.dtype) + self.assertTrue(common.allequal(self.obj, nparr)) + + def test_kwargs_obj_atom_error(self): + atom = tb.Atom.from_dtype(np.dtype('complex')) + # shape = self.shape + self.shape + self.assertRaises(TypeError, + self.h5file.create_vlarray, + self.where, + self.name, + title=self.title, + obj=self.obj, + atom=atom) + + +def suite(): + theSuite = common.unittest.TestSuite() + niter = 1 + + for n in range(niter): + theSuite.addTest(common.unittest.makeSuite(BasicNumPyTestCase)) + theSuite.addTest(common.unittest.makeSuite(BasicPythonTestCase)) + theSuite.addTest(common.unittest.makeSuite(ZlibComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscShuffleComprTestCase)) + theSuite.addTest( + common.unittest.makeSuite(BloscBitShuffleComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscBloscLZComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscLZ4ComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscLZ4HCComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscSnappyComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscZlibComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(BloscZstdComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(LZOComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(Bzip2ComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(TypesReopenTestCase)) + theSuite.addTest(common.unittest.makeSuite(TypesNoReopenTestCase)) + theSuite.addTest(common.unittest.makeSuite(MDTypesNumPyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenAppendShapeTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseAppendShapeTestCase)) + theSuite.addTest(common.unittest.makeSuite(PythonFlavorTestCase)) + theSuite.addTest(common.unittest.makeSuite(NumPyFlavorTestCase)) + theSuite.addTest(common.unittest.makeSuite(ReadRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(GetItemRangeTestCase)) + 
theSuite.addTest(common.unittest.makeSuite(SetRangeTestCase)) + theSuite.addTest(common.unittest.makeSuite(ShuffleComprTestCase)) + theSuite.addTest(common.unittest.makeSuite(CloseCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(OpenCopyTestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex1TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex2TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex3TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex4TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex5TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex6TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex7TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex8TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex9TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex10TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex11TestCase)) + theSuite.addTest(common.unittest.makeSuite(CopyIndex12TestCase)) + theSuite.addTest(common.unittest.makeSuite(ChunkshapeTestCase)) + theSuite.addTest(common.unittest.makeSuite(VLUEndianTestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateOpenTestCase)) + theSuite.addTest(common.unittest.makeSuite(TruncateCloseTestCase)) + theSuite.addTest(common.unittest.makeSuite(PointSelectionTestCase)) + theSuite.addTest( + common.unittest.makeSuite(SizeInMemoryPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(SizeOnDiskPropertyTestCase)) + theSuite.addTest(common.unittest.makeSuite(AccessClosedTestCase)) + theSuite.addTest(common.unittest.makeSuite(TestCreateVLArrayArgs)) + + return theSuite + + +if __name__ == '__main__': + common.parse_argv(sys.argv) + common.print_versions() + common.unittest.main(defaultTest='suite') diff --git a/tables/tests/time-table-vlarray-1_x.h5 b/tables/tests/time-table-vlarray-1_x.h5 new file mode 100644 index 0000000..8e96bc8 Binary files /dev/null and b/tables/tests/time-table-vlarray-1_x.h5 differ diff --git a/tables/tests/times-nested-be.h5 b/tables/tests/times-nested-be.h5 new file mode 100644 index 0000000..7ff7f5a Binary files /dev/null and b/tables/tests/times-nested-be.h5 differ diff --git a/tables/tests/vlstr_attr.h5 b/tables/tests/vlstr_attr.h5 new file mode 100644 index 0000000..c2f436c Binary files /dev/null and b/tables/tests/vlstr_attr.h5 differ diff --git a/tables/tests/vlunicode_endian.h5 b/tables/tests/vlunicode_endian.h5 new file mode 100644 index 0000000..2996021 Binary files /dev/null and b/tables/tests/vlunicode_endian.h5 differ diff --git a/tables/tests/zerodim-attrs-1.3.h5 b/tables/tests/zerodim-attrs-1.3.h5 new file mode 100644 index 0000000..b4065d1 Binary files /dev/null and b/tables/tests/zerodim-attrs-1.3.h5 differ diff --git a/tables/tests/zerodim-attrs-1.4.h5 b/tables/tests/zerodim-attrs-1.4.h5 new file mode 100644 index 0000000..4b1e865 Binary files /dev/null and b/tables/tests/zerodim-attrs-1.4.h5 differ diff --git a/tables/undoredo.py b/tables/undoredo.py new file mode 100644 index 0000000..0cd8ccd --- /dev/null +++ b/tables/undoredo.py @@ -0,0 +1,149 @@ +"""Support for undoing and redoing actions. + +Functions: + +* undo(file, operation, *args) +* redo(file, operation, *args) +* move_to_shadow(file, path) +* move_from_shadow(file, path) +* attr_to_shadow(file, path, name) +* attr_from_shadow(file, path, name) + +Misc variables: + +`__docformat__` + The format of documentation strings in this module. 
+ +""" + +from .path import split_path + + +__docformat__ = 'reStructuredText' +"""The format of documentation strings in this module.""" + + +def undo(file_, operation, *args): + if operation == 'CREATE': + undo_create(file_, args[0]) + elif operation == 'REMOVE': + undo_remove(file_, args[0]) + elif operation == 'MOVE': + undo_move(file_, args[0], args[1]) + elif operation == 'ADDATTR': + undo_add_attr(file_, args[0], args[1]) + elif operation == 'DELATTR': + undo_del_attr(file_, args[0], args[1]) + else: + raise NotImplementedError("the requested unknown operation %r can " + "not be undone; please report this to the " + "authors" % operation) + + +def redo(file_, operation, *args): + if operation == 'CREATE': + redo_create(file_, args[0]) + elif operation == 'REMOVE': + redo_remove(file_, args[0]) + elif operation == 'MOVE': + redo_move(file_, args[0], args[1]) + elif operation == 'ADDATTR': + redo_add_attr(file_, args[0], args[1]) + elif operation == 'DELATTR': + redo_del_attr(file_, args[0], args[1]) + else: + raise NotImplementedError("the requested unknown operation %r can " + "not be redone; please report this to the " + "authors" % operation) + + +def move_to_shadow(file_, path): + node = file_._get_node(path) + + (shparent, shname) = file_._shadow_name() + node._g_move(shparent, shname) + + +def move_from_shadow(file_, path): + (shparent, shname) = file_._shadow_name() + node = shparent._f_get_child(shname) + + (pname, name) = split_path(path) + parent = file_._get_node(pname) + node._g_move(parent, name) + + +def undo_create(file_, path): + move_to_shadow(file_, path) + + +def redo_create(file_, path): + move_from_shadow(file_, path) + + +def undo_remove(file_, path): + move_from_shadow(file_, path) + + +def redo_remove(file_, path): + move_to_shadow(file_, path) + + +def undo_move(file_, origpath, destpath): + (origpname, origname) = split_path(origpath) + + node = file_._get_node(destpath) + origparent = file_._get_node(origpname) + node._g_move(origparent, origname) + + +def redo_move(file_, origpath, destpath): + (destpname, destname) = split_path(destpath) + + node = file_._get_node(origpath) + destparent = file_._get_node(destpname) + node._g_move(destparent, destname) + + +def attr_to_shadow(file_, path, name): + node = file_._get_node(path) + attrs = node._v_attrs + value = getattr(attrs, name) + + (shparent, shname) = file_._shadow_name() + shattrs = shparent._v_attrs + + # Set the attribute only if it has not been kept in the shadow. + # This avoids re-pickling complex attributes on REDO. + if shname not in shattrs: + shattrs._g__setattr(shname, value) + + attrs._g__delattr(name) + + +def attr_from_shadow(file_, path, name): + (shparent, shname) = file_._shadow_name() + shattrs = shparent._v_attrs + value = getattr(shattrs, shname) + + node = file_._get_node(path) + node._v_attrs._g__setattr(name, value) + + # Keeping the attribute in the shadow allows reusing it on Undo/Redo. + # shattrs._g__delattr(shname) + + +def undo_add_attr(file_, path, name): + attr_to_shadow(file_, path, name) + + +def redo_add_attr(file_, path, name): + attr_from_shadow(file_, path, name) + + +def undo_del_attr(file_, path, name): + attr_from_shadow(file_, path, name) + + +def redo_del_attr(file_, path, name): + attr_to_shadow(file_, path, name) diff --git a/tables/unimplemented.py b/tables/unimplemented.py new file mode 100644 index 0000000..ff1bbbb --- /dev/null +++ b/tables/unimplemented.py @@ -0,0 +1,149 @@ +"""Here is defined the UnImplemented class.""" + +import warnings + +from . 
import hdf5extension +from .utils import SizeType +from .node import Node +from .leaf import Leaf + + +class UnImplemented(hdf5extension.UnImplemented, Leaf): + """This class represents datasets not supported by PyTables in an HDF5 + file. + + When reading a generic HDF5 file (i.e. one that has not been created with + PyTables, but with some other HDF5 library based tool), chances are that + the specific combination of datatypes or dataspaces in some dataset might + not be supported by PyTables yet. In such a case, this dataset will be + mapped into an UnImplemented instance and the user will still be able to + access the complete object tree of the generic HDF5 file. The user will + also be able to *read and write the attributes* of the dataset, *access + some of its metadata*, and perform *certain hierarchy manipulation + operations* like deleting or moving (but not copying) the node. Of course, + the user will not be able to read the actual data on it. + + This is an elegant way to allow users to work with generic HDF5 files + despite the fact that some of its datasets are not supported by + PyTables. However, if you are really interested in having full access to an + unimplemented dataset, please get in contact with the developer team. + + This class does not have any public instance variables or methods, except + those inherited from the Leaf class (see :ref:`LeafClassDescr`). + + """ + + # Class identifier. + _c_classid = 'UNIMPLEMENTED' + + def __init__(self, parentnode, name): + """Create the `UnImplemented` instance.""" + + # UnImplemented objects always come from opening an existing node + # (they can not be created). + self._v_new = False + """Is this the first time the node has been created?""" + self.nrows = SizeType(0) + """The length of the first dimension of the data.""" + self.shape = (SizeType(0),) + """The shape of the stored data.""" + self.byteorder = None + """The endianness of data in memory ('big', 'little' or + 'irrelevant').""" + + super().__init__(parentnode, name) + + def _g_open(self): + (self.shape, self.byteorder, object_id) = self._open_unimplemented() + try: + self.nrows = SizeType(self.shape[0]) + except IndexError: + self.nrows = SizeType(0) + return object_id + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + """Do nothing. + + This method does nothing, but a ``UserWarning`` is issued. + Please note that this method *does not return a new node*, but + ``None``. + + """ + + warnings.warn( + "UnImplemented node %r does not know how to copy itself; skipping" + % (self._v_pathname,)) + return None # Can you see it? + + def _f_copy(self, newparent=None, newname=None, + overwrite=False, recursive=False, createparents=False, + **kwargs): + """Do nothing. + + This method does nothing, since `UnImplemented` nodes can not + be copied. However, a ``UserWarning`` is issued. Please note + that this method *does not return a new node*, but ``None``. + + """ + + # This also does nothing but warn. + self._g_copy(newparent, newname, recursive, **kwargs) + return None # Can you see it? + + def __repr__(self): + return """{} + NOTE: +""".format(str(self), self._v_file.filename) + + +# Classes reported as H5G_UNKNOWN by HDF5 +class Unknown(Node): + """This class represents nodes reported as *unknown* by the underlying + HDF5 library. + + This class does not have any public instance variables or methods, except + those inherited from the Node class. 
+ + """ + + # Class identifier + _c_classid = 'UNKNOWN' + + def __init__(self, parentnode, name): + """Create the `Unknown` instance.""" + + self._v_new = False + super().__init__(parentnode, name) + + def _g_new(self, parentnode, name, init=False): + pass + + def _g_open(self): + return 0 + + def _g_copy(self, newparent, newname, recursive, _log=True, **kwargs): + # Silently avoid doing copies of unknown nodes + return None + + def _g_delete(self, parent): + pass + + def __str__(self): + pathname = self._v_pathname + classname = self.__class__.__name__ + return f"{pathname} ({classname})" + + def __repr__(self): + return f"""{self!s} + NOTE: +""" + + +# These are listed here for backward compatibility with PyTables 0.9.x indexes +class OldIndexArray(UnImplemented): + _c_classid = 'IndexArray' diff --git a/tables/utils.py b/tables/utils.py new file mode 100644 index 0000000..035d584 --- /dev/null +++ b/tables/utils.py @@ -0,0 +1,434 @@ +"""Utility functions.""" + +import math +import os +import sys +import warnings +import weakref +from pathlib import Path +from time import perf_counter as clock + +import numpy as np + +from .flavor import array_of_flavor + +# The map between byteorders in NumPy and PyTables +byteorders = { + '>': 'big', + '<': 'little', + '=': sys.byteorder, + '|': 'irrelevant', +} + +# The type used for size values: indexes, coordinates, dimension +# lengths, row numbers, shapes, chunk shapes, byte counts... +SizeType = np.int64 + + +def correct_byteorder(ptype, byteorder): + """Fix the byteorder depending on the PyTables types.""" + + if ptype in ['string', 'bool', 'int8', 'uint8', 'object']: + return "irrelevant" + else: + return byteorder + + +def is_idx(index): + """Checks if an object can work as an index or not.""" + + if type(index) is int: + return True + elif hasattr(index, "__index__"): + # Exclude the array([idx]) as working as an index. Fixes #303. + if (hasattr(index, "shape") and index.shape != ()): + return False + try: + index.__index__() + if isinstance(index, bool): + warnings.warn( + 'using a boolean instead of an integer will result in an ' + 'error in the future', DeprecationWarning, stacklevel=2) + return True + except TypeError: + return False + elif isinstance(index, np.integer): + return True + # For Python 2.4 one should test 0-dim and 1-dim, 1-elem arrays as well + elif (isinstance(index, np.ndarray) and (index.shape == ()) and + index.dtype.str[1] == 'i'): + return True + + return False + + +def idx2long(index): + """Convert a possible index into a long int.""" + + try: + return int(index) + except Exception: + raise TypeError("not an integer type.") + + +# This is used in VLArray and EArray to produce NumPy object compliant +# with atom from a generic python type. If copy is stated as True, it +# is assured that it will return a copy of the object and never the same +# object or a new one sharing the same memory. +def convert_to_np_atom(arr, atom, copy=False): + """Convert a generic object into a NumPy object compliant with atom.""" + + # First, convert the object into a NumPy array + nparr = array_of_flavor(arr, 'numpy') + # Copy of data if necessary for getting a contiguous buffer, or if + # dtype is not the correct one. + if atom.shape == (): + # Scalar atom case + nparr = np.array(nparr, dtype=atom.dtype, copy=copy) + else: + # Multidimensional atom case. Addresses #133. + # We need to use this strange way to obtain a dtype compliant + # array because NumPy doesn't honor the shape of the dtype when + # it is multidimensional. 
See: + # http://scipy.org/scipy/numpy/ticket/926 + # for details. + # All of this is done just to taking advantage of the NumPy + # broadcasting rules. + newshape = nparr.shape[:-len(atom.dtype.shape)] + nparr2 = np.empty(newshape, dtype=[('', atom.dtype)]) + nparr2['f0'][:] = nparr + # Return a view (i.e. get rid of the record type) + nparr = nparr2.view(atom.dtype) + return nparr + + +# The next is used in Array, EArray and VLArray, and it is a bit more +# high level than convert_to_np_atom +def convert_to_np_atom2(object, atom): + """Convert a generic object into a NumPy object compliant with atom.""" + + # Check whether the object needs to be copied to make the operation + # safe to in-place conversion. + copy = atom.type in ['time64'] + nparr = convert_to_np_atom(object, atom, copy) + # Finally, check the byteorder and change it if needed + byteorder = byteorders[nparr.dtype.byteorder] + if (byteorder in ['little', 'big'] and byteorder != sys.byteorder): + # The byteorder needs to be fixed (a copy is made + # so that the original array is not modified) + nparr = nparr.byteswap() + + return nparr + + +def check_file_access(filename, mode='r'): + """Check for file access in the specified `mode`. + + `mode` is one of the modes supported by `File` objects. If the file + indicated by `filename` can be accessed using that `mode`, the + function ends successfully. Else, an ``IOError`` is raised + explaining the reason of the failure. + + All this paraphernalia is used to avoid the lengthy and scaring HDF5 + messages produced when there are problems opening a file. No + changes are ever made to the file system. + + """ + + path = Path(filename).resolve() + + if mode == 'r': + # The file should be readable. + if not os.access(path, os.F_OK): + raise OSError(f"``{path}`` does not exist") + if not path.is_file(): + raise OSError(f"``{path}`` is not a regular file") + if not os.access(path, os.R_OK): + raise OSError(f"file ``{path}`` exists but it can not be read") + elif mode == 'w': + if os.access(path, os.F_OK): + # Since the file is not removed but replaced, + # it must already be accessible to read and write operations. + check_file_access(path, 'r+') + else: + # A new file is going to be created, + # so the directory should be writable. + if not os.access(path.parent, os.F_OK): + raise OSError(f"``{path.parent}`` does not exist") + if not path.parent.is_dir(): + raise OSError(f"``{path.parent}`` is not a directory") + if not os.access(path.parent, os.W_OK): + raise OSError( + f"directory ``{path.parent}`` exists but it can not be " + f"written" + ) + elif mode == 'a': + if os.access(path, os.F_OK): + check_file_access(path, 'r+') + else: + check_file_access(path, 'w') + elif mode == 'r+': + check_file_access(path, 'r') + if not os.access(path, os.W_OK): + raise OSError(f"file ``{path}`` exists but it can not be written") + else: + raise ValueError(f"invalid mode: {mode!r}") + + +def lazyattr(fget): + """Create a *lazy attribute* from the result of `fget`. + + This function is intended to be used as a *method decorator*. It + returns a *property* which caches the result of calling the `fget` + instance method. The docstring of `fget` is used for the property + itself. For instance: + + >>> class MyClass(object): + ... @lazyattr + ... def attribute(self): + ... 'Attribute description.' + ... print('creating value') + ... return 10 + ... + >>> type(MyClass.attribute) + + >>> MyClass.attribute.__doc__ + 'Attribute description.' 
+ >>> obj = MyClass() + >>> obj.__dict__ + {} + >>> obj.attribute + creating value + 10 + >>> obj.__dict__ + {'attribute': 10} + >>> obj.attribute + 10 + >>> del obj.attribute + Traceback (most recent call last): + ... + AttributeError: can't delete attribute + + .. warning:: + + Please note that this decorator *changes the type of the + decorated object* from an instance method into a property. + + """ + + name = fget.__name__ + + def newfget(self): + mydict = self.__dict__ + if name in mydict: + return mydict[name] + mydict[name] = value = fget(self) + return value + + return property(newfget, None, None, fget.__doc__) + + +def show_stats(explain, tref, encoding=None): + """Show the used memory (only works for Linux 2.6.x).""" + + for line in Path('/proc/self/status').read_text().splitlines(): + if line.startswith("VmSize:"): + vmsize = int(line.split()[1]) + elif line.startswith("VmRSS:"): + vmrss = int(line.split()[1]) + elif line.startswith("VmData:"): + vmdata = int(line.split()[1]) + elif line.startswith("VmStk:"): + vmstk = int(line.split()[1]) + elif line.startswith("VmExe:"): + vmexe = int(line.split()[1]) + elif line.startswith("VmLib:"): + vmlib = int(line.split()[1]) + print("Memory usage: ******* %s *******" % explain) + print(f"VmSize: {vmsize:>7} kB\tVmRSS: {vmrss:>7} kB") + print(f"VmData: {vmdata:>7} kB\tVmStk: {vmstk:>7} kB") + print(f"VmExe: {vmexe:>7} kB\tVmLib: {vmlib:>7} kB") + tnow = clock() + print(f"WallClock time: {tnow - tref:.3f}") + return tnow + + +# truncate data before calling __setitem__, to improve compression ratio +# this function is taken verbatim from netcdf4-python +def quantize(data, least_significant_digit): + """quantize data to improve compression. + + Data is quantized using around(scale*data)/scale, where scale is + 2**bits, and bits is determined from the least_significant_digit. + + For example, if least_significant_digit=1, bits will be 4. + + """ + + exp = -least_significant_digit + exp = math.floor(exp) if exp < 0 else math.ceil(exp) + bits = math.ceil(math.log2(10 ** -exp)) + scale = 2 ** bits + datout = np.around(scale * data) / scale + + return datout + + +# Utilities to detect leaked instances. See recipe 14.10 of the Python +# Cookbook by Martelli & Ascher. 
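A worked example of the quantize() helper defined above: with least_significant_digit=1 the computed bit count is 4, so scale = 2**4 = 16 and values are rounded to the nearest 1/16, i.e. to well within 0.1 of the original data.

    import numpy as np
    from tables.utils import quantize   # the function defined just above

    data = np.array([1.23456, 2.71828])
    q = quantize(data, least_significant_digit=1)
    print(q)                                  # -> [1.25  2.6875] (multiples of 1/16)
    assert np.all(np.abs(q - data) <= 0.1)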
+tracked_classes = {} + + +def log_instance_creation(instance, name=None): + if name is None: + name = instance.__class__.__name__ + if name not in tracked_classes: + tracked_classes[name] = [] + tracked_classes[name].append(weakref.ref(instance)) + + +def string_to_classes(s): + if s == '*': + c = sorted(tracked_classes) + return c + else: + return s.split() + + +def fetch_logged_instances(classes="*"): + classnames = string_to_classes(classes) + return [(cn, len(tracked_classes[cn])) for cn in classnames] + + +def count_logged_instances(classes, file=sys.stdout): + for classname in string_to_classes(classes): + file.write("%s: %d\n" % (classname, len(tracked_classes[classname]))) + + +def list_logged_instances(classes, file=sys.stdout): + for classname in string_to_classes(classes): + file.write('\n%s:\n' % classname) + for ref in tracked_classes[classname]: + obj = ref() + if obj is not None: + file.write(' %s\n' % repr(obj)) + + +def dump_logged_instances(classes, file=sys.stdout): + for classname in string_to_classes(classes): + file.write('\n%s:\n' % classname) + for ref in tracked_classes[classname]: + obj = ref() + if obj is not None: + file.write(' %s:\n' % obj) + for key, value in obj.__dict__.items(): + file.write(f' {key:>20} : {value}\n') + + +# +# A class useful for cache usage +# +class CacheDict(dict): + """A dictionary that prevents itself from growing too much.""" + + def __init__(self, maxentries): + self.maxentries = maxentries + super().__init__(self) + + def __setitem__(self, key, value): + # Protection against growing the cache too much + if len(self) > self.maxentries: + # Remove a 10% of (arbitrary) elements from the cache + entries_to_remove = self.maxentries / 10 + for k in list(self)[:entries_to_remove]: + super().__delitem__(k) + super().__setitem__(key, value) + + +class NailedDict: + """A dictionary which ignores its items when it has nails on it.""" + + def __init__(self, maxentries): + self.maxentries = maxentries + self._cache = {} + self._nailcount = 0 + + # Only a restricted set of dictionary methods are supported. That + # is why we buy instead of inherit. + + # The following are intended to be used by ``Table`` code changing + # the set of usable indexes. + + def clear(self): + self._cache.clear() + + def nail(self): + self._nailcount += 1 + + def unnail(self): + self._nailcount -= 1 + + # The following are intended to be used by ``Table`` code handling + # conditions. + + def __contains__(self, key): + if self._nailcount > 0: + return False + return key in self._cache + + def __getitem__(self, key): + if self._nailcount > 0: + raise KeyError(key) + return self._cache[key] + + def get(self, key, default=None): + if self._nailcount > 0: + return default + return self._cache.get(key, default) + + def __setitem__(self, key, value): + if self._nailcount > 0: + return + cache = self._cache + # Protection against growing the cache too much + if len(cache) > self.maxentries: + # Remove a 10% of (arbitrary) elements from the cache + entries_to_remove = max(self.maxentries // 10, 1) + for k in list(cache)[:entries_to_remove]: + del cache[k] + cache[key] = value + + +def detect_number_of_cores(): + """Detects the number of cores on a system. + + Cribbed from pp. 
+ + """ + + # Linux, Unix and MacOS: + if hasattr(os, "sysconf"): + if "SC_NPROCESSORS_ONLN" in os.sysconf_names: + # Linux & Unix: + ncpus = os.sysconf("SC_NPROCESSORS_ONLN") + if isinstance(ncpus, int) and ncpus > 0: + return ncpus + else: # OSX: + return int(os.popen2("sysctl -n hw.ncpu")[1].read()) + # Windows: + if "NUMBER_OF_PROCESSORS" in os.environ: + ncpus = int(os.environ["NUMBER_OF_PROCESSORS"]) + if ncpus > 0: + return ncpus + return 1 # Default + + +def _test(): + """Run ``doctest`` on this module.""" + + import doctest + doctest.testmod() + + +if __name__ == '__main__': + _test() diff --git a/tables/utilsextension.pxd b/tables/utilsextension.pxd new file mode 100644 index 0000000..c3f0a60 --- /dev/null +++ b/tables/utilsextension.pxd @@ -0,0 +1,22 @@ +######################################################################## +# +# License: BSD +# Created: March 03, 2008 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id: definitions.pyd 1018 2005-06-20 09:43:34Z faltet $ +# +######################################################################## + +""" +These are declarations for functions in utilsextension.pyx that have to +be shared with other extensions. +""" + +from .definitions cimport hsize_t, hid_t, hobj_ref_t +from numpy cimport ndarray + +cdef hsize_t *malloc_dims(object) +cdef hid_t get_native_type(hid_t) nogil +cdef str cstr_to_pystr(const char*) +cdef int load_reference(hid_t dataset_id, hobj_ref_t *refbuf, size_t item_size, ndarray nparr) except -1 diff --git a/tables/utilsextension.pyx b/tables/utilsextension.pyx new file mode 100644 index 0000000..cd75005 --- /dev/null +++ b/tables/utilsextension.pyx @@ -0,0 +1,1488 @@ +######################################################################## +# +# License: BSD +# Created: May 20, 2005 +# Author: Francesc Alted - faltet@pytables.com +# +# $Id$ +# +######################################################################## + +"""Cython utilities for PyTables and HDF5 library.""" + +import os +import sys +import warnings + +try: + import zlib + zlib_imported = True +except ImportError: + zlib_imported = False + +import numpy + +from .description import Description, Col +from .misc.enum import Enum +from .exceptions import HDF5ExtError +from .atom import Atom, EnumAtom, ReferenceAtom + +from .utils import check_file_access + +from libc.stdio cimport stderr +from libc.stdlib cimport malloc, free +from libc.string cimport strchr, strcmp, strncmp, strlen +from cpython.bytes cimport PyBytes_Check, PyBytes_FromStringAndSize +from cpython.unicode cimport PyUnicode_DecodeUTF8, PyUnicode_Check + +from numpy cimport (import_array, ndarray, dtype, + npy_int64, PyArray_DATA, PyArray_GETPTR1, PyArray_DescrFromType, npy_intp, + NPY_BOOL, NPY_STRING, NPY_INT8, NPY_INT16, NPY_INT32, NPY_INT64, + NPY_UINT8, NPY_UINT16, NPY_UINT32, NPY_UINT64, NPY_FLOAT16, NPY_FLOAT32, + NPY_FLOAT64, NPY_COMPLEX64, NPY_COMPLEX128) + +from .definitions cimport (H5ARRAYget_info, H5ARRAYget_ndims, + H5ATTRfind_attribute, H5ATTRget_attribute_string, H5D_CHUNKED, + H5D_layout_t, H5Dclose, H5Dget_type, H5Dopen, H5E_DEFAULT, + H5E_WALK_DOWNWARD, H5E_auto_t, H5E_error_t, H5E_walk_t, H5Eget_msg, + H5Eprint, H5Eset_auto, H5Ewalk, H5F_ACC_RDONLY, H5Fclose, H5Fis_hdf5, + H5Fopen, H5Gclose, H5Gopen, H5P_DEFAULT, H5T_ARRAY, H5T_BITFIELD, + H5T_COMPOUND, H5T_CSET_ASCII, H5T_CSET_UTF8, H5T_C_S1, H5T_DIR_DEFAULT, + H5T_ENUM, H5T_FLOAT, H5T_IEEE_F32BE, H5T_IEEE_F32LE, H5T_IEEE_F64BE, + H5T_IEEE_F64LE, H5T_INTEGER, H5T_NATIVE_DOUBLE, H5T_NATIVE_LDOUBLE, + 
H5T_NO_CLASS, H5T_OPAQUE, + H5T_ORDER_BE, H5T_ORDER_LE, H5T_REFERENCE, H5T_STD_B8BE, H5T_STD_B8LE, + H5T_STD_I16BE, H5T_STD_I16LE, H5T_STD_I32BE, H5T_STD_I32LE, H5T_STD_I64BE, + H5T_STD_I64LE, H5T_STD_I8BE, H5T_STD_I8LE, H5T_STD_U16BE, H5T_STD_U16LE, + H5T_STD_U32BE, H5T_STD_U32LE, H5T_STD_U64BE, H5T_STD_U64LE, H5T_STD_U8BE, + H5T_STD_U8LE, H5T_STRING, H5T_TIME, H5T_UNIX_D32BE, H5T_UNIX_D32LE, + H5T_UNIX_D64BE, H5T_UNIX_D64LE, H5T_VLEN, H5T_class_t, H5T_sign_t, + H5Tarray_create, H5Tclose, H5Tequal, H5Tcopy, H5Tcreate, H5Tenum_create, + H5Tenum_insert, H5Tget_array_dims, H5Tget_array_ndims, H5Tget_class, + H5Tget_member_name, H5Tget_member_type, H5Tget_member_value, + H5Tget_native_type, H5Tget_nmembers, H5Tget_offset, H5Tget_order, + H5Tget_member_offset, + H5Tget_precision, H5Tget_sign, H5Tget_size, H5Tget_super, H5Tinsert, + H5Tis_variable_str, H5Tpack, H5Tset_precision, H5Tset_size, H5Tvlen_create, + H5Zunregister, FILTER_BLOSC, + PyArray_Scalar, create_ieee_complex128, create_ieee_complex64, + create_ieee_float16, create_ieee_complex192, create_ieee_complex256, + get_len_of_range, get_order, herr_t, hid_t, hsize_t, + hssize_t, htri_t, is_complex, register_blosc, set_order, + pt_H5free_memory, H5T_STD_REF_OBJ, H5Rdereference, H5R_OBJECT, H5I_DATASET, H5I_REFERENCE, + H5Iget_type, hobj_ref_t, H5Oclose) + + + +# Platform-dependent types +if sys.byteorder == "little": + platform_byteorder = H5T_ORDER_LE + # Standard types, independent of the byteorder + H5T_STD_B8 = H5T_STD_B8LE + H5T_STD_I8 = H5T_STD_I8LE + H5T_STD_I16 = H5T_STD_I16LE + H5T_STD_I32 = H5T_STD_I32LE + H5T_STD_I64 = H5T_STD_I64LE + H5T_STD_U8 = H5T_STD_U8LE + H5T_STD_U16 = H5T_STD_U16LE + H5T_STD_U32 = H5T_STD_U32LE + H5T_STD_U64 = H5T_STD_U64LE + H5T_IEEE_F32 = H5T_IEEE_F32LE + H5T_IEEE_F64 = H5T_IEEE_F64LE + H5T_UNIX_D32 = H5T_UNIX_D32LE + H5T_UNIX_D64 = H5T_UNIX_D64LE +else: # sys.byteorder == "big" + platform_byteorder = H5T_ORDER_BE + # Standard types, independent of the byteorder + H5T_STD_B8 = H5T_STD_B8BE + H5T_STD_I8 = H5T_STD_I8BE + H5T_STD_I16 = H5T_STD_I16BE + H5T_STD_I32 = H5T_STD_I32BE + H5T_STD_I64 = H5T_STD_I64BE + H5T_STD_U8 = H5T_STD_U8BE + H5T_STD_U16 = H5T_STD_U16BE + H5T_STD_U32 = H5T_STD_U32BE + H5T_STD_U64 = H5T_STD_U64BE + H5T_IEEE_F32 = H5T_IEEE_F32BE + H5T_IEEE_F64 = H5T_IEEE_F64BE + H5T_UNIX_D32 = H5T_UNIX_D32BE + H5T_UNIX_D64 = H5T_UNIX_D64BE + + +#---------------------------------------------------------------------------- + +# Conversion from PyTables string types to HDF5 native types +# List only types that are susceptible of changing byteorder +# (complex & enumerated types are special and should not be listed here) +pttype_to_hdf5 = { + 'int8' : H5T_STD_I8, 'uint8' : H5T_STD_U8, + 'int16' : H5T_STD_I16, 'uint16' : H5T_STD_U16, + 'int32' : H5T_STD_I32, 'uint32' : H5T_STD_U32, + 'int64' : H5T_STD_I64, 'uint64' : H5T_STD_U64, + 'float32': H5T_IEEE_F32, 'float64': H5T_IEEE_F64, + 'float96': H5T_NATIVE_LDOUBLE, 'float128': H5T_NATIVE_LDOUBLE, + 'time32' : H5T_UNIX_D32, 'time64' : H5T_UNIX_D64, +} + +# Special cases whose byteorder cannot be directly changed +pt_special_kinds = ['complex', 'string', 'enum', 'bool'] + +# Conversion table from NumPy extended codes prefixes to PyTables kinds +npext_prefixes_to_ptkinds = { + "S": "string", + "b": "bool", + "i": "int", + "u": "uint", + "f": "float", + "c": "complex", + "t": "time", + "e": "enum", +} + +# Names of HDF5 classes +hdf5_class_to_string = { + H5T_NO_CLASS : 'H5T_NO_CLASS', + H5T_INTEGER : 'H5T_INTEGER', + H5T_FLOAT : 'H5T_FLOAT', + H5T_TIME 
: 'H5T_TIME', + H5T_STRING : 'H5T_STRING', + H5T_BITFIELD : 'H5T_BITFIELD', + H5T_OPAQUE : 'H5T_OPAQUE', + H5T_COMPOUND : 'H5T_COMPOUND', + H5T_REFERENCE : 'H5T_REFERENCE', + H5T_ENUM : 'H5T_ENUM', + H5T_VLEN : 'H5T_VLEN', + H5T_ARRAY : 'H5T_ARRAY', +} + + +# Depprecated API +PTTypeToHDF5 = pttype_to_hdf5 +PTSpecialKinds = pt_special_kinds +NPExtPrefixesToPTKinds = npext_prefixes_to_ptkinds +HDF5ClassToString = hdf5_class_to_string + + +from numpy import sctypeDict +cdef int have_float16 = ("float16" in sctypeDict) + + +#---------------------------------------------------------------------- + +# External declarations + + +# PyTables helper routines. +cdef extern from "utils.h": + + int getLibrary(char *libname) nogil + #object getZLIBVersionInfo() + object getHDF5VersionInfo() + object get_filter_names( hid_t loc_id, char *dset_name) + + H5T_class_t getHDF5ClassID(hid_t loc_id, char *name, H5D_layout_t *layout, + hid_t *type_id, hid_t *dataset_id) nogil + + +# Functions from Blosc +cdef extern from "blosc.h" nogil: + void blosc_init() + int blosc_set_nthreads(int nthreads) + const char* blosc_list_compressors() + int blosc_compcode_to_compname(int compcode, char **compname) + int blosc_get_complib_info(char *compname, char **complib, char **version) + +cdef extern from "H5ARRAY.h" nogil: + herr_t H5ARRAYread(hid_t dataset_id, hid_t type_id, + hsize_t start, hsize_t nrows, hsize_t step, + int extdim, void *data) + +# @TODO: use the c_string_type and c_string_encoding global directives +# (new in cython 0.19) +# TODO: drop +cdef str cstr_to_pystr(const char* cstring): + return cstring.decode('utf-8') + + +#---------------------------------------------------------------------- +# Initialization code + +# The NumPy API requires this function to be called before +# using any NumPy facilities in an extension module. +import_array() + +# NaN-aware sorting with NaN as the greatest element +# numpy.isnan only takes floats, this should work for strings too +cpdef nan_aware_lt(a, b): return a < b or (b != b and a == a) +cpdef nan_aware_le(a, b): return a <= b or b != b +cpdef nan_aware_gt(a, b): return a > b or (a != a and b == b) +cpdef nan_aware_ge(a, b): return a >= b or a != a + +def bisect_left(a, x, int lo=0): + """Return the index where to insert item x in list a, assuming a is sorted. + + The return value i is such that all e in a[:i] have e < x, and all e in + a[i:] have e >= x. So if x already appears in the list, i points just + before the leftmost x already there. + + """ + + cdef int mid, hi = len(a) + + lo = 0 + while lo < hi: + mid = (lo+hi)//2 + if nan_aware_lt(a[mid], x): lo = mid+1 + else: hi = mid + return lo + +def bisect_right(a, x, int lo=0): + """Return the index where to insert item x in list a, assuming a is sorted. + + The return value i is such that all e in a[:i] have e <= x, and all e in + a[i:] have e > x. So if x already appears in the list, i points just + beyond the rightmost x already there. 
+ + """ + + cdef int mid, hi = len(a) + + lo = 0 + while lo < hi: + mid = (lo+hi)//2 + if nan_aware_lt(x, a[mid]): hi = mid + else: lo = mid+1 + return lo + +cdef register_blosc_(): + cdef char *version + cdef char *date + + register_blosc(&version, &date) + compinfo = (version, date) + free(version) + free(date) + return compinfo[0].decode('ascii'), compinfo[1].decode('ascii') + +blosc_version = register_blosc_() + +# Old versions (<1.4) of the blosc compression library +# rely on unaligned memory access, so they are not functional on some +# platforms (see https://github.com/FrancescAlted/blosc/issues/3 and +# http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=661286). +# This function has been written by Julian Taylor . +def _arch_without_blosc(): + import platform + arch = platform.machine().lower() + for a in ("arm", "sparc", "mips", "aarch64"): + if a in arch: + return True + return False + +if blosc_version and blosc_version < ('1', '4') and _arch_without_blosc(): + # Only use bloc compressor on platforms that actually support it. + H5Zunregister(FILTER_BLOSC) + blosc_version = None +else: + blosc_init() # from 1.2 on, Blosc library must be initialized + + +# Important: Blosc calls that modifies global variables in Blosc must be +# called from the same extension where Blosc is registered in HDF5. +def set_blosc_max_threads(nthreads): + """set_blosc_max_threads(nthreads) + + Set the maximum number of threads that Blosc can use. + + This actually overrides the :data:`tables.parameters.MAX_BLOSC_THREADS` + setting in :mod:`tables.parameters`, so the new value will be effective until + this function is called again or a new file with a different + :data:`tables.parameters.MAX_BLOSC_THREADS` value is specified. + + Returns the previous setting for maximum threads. + + """ + + return blosc_set_nthreads(nthreads) + + + + +if sys.platform == "win32": + # We need a different approach in Windows, because it complains when + # trying to import the extension that is linked with a dynamic library + # that is not installed in the system. 
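A minimal usage sketch for the set_blosc_max_threads() function defined above; the thread count is illustrative, and the return value allows the previous setting to be restored afterwards.

    from tables.utilsextension import set_blosc_max_threads

    previous = set_blosc_max_threads(4)   # let Blosc use up to 4 threads from now on
    # ... read or write some Blosc-compressed datasets ...
    set_blosc_max_threads(previous)       # restore the earlier setting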
+ + # Initialize & register lzo + if getLibrary("lzo2") == 0 or getLibrary("lzo1") == 0: + import tables._comp_lzo + lzo_version = tables._comp_lzo.register_() + else: + lzo_version = None + + # Initialize & register bzip2 + if getLibrary("bzip2") == 0 or getLibrary("libbz2") == 0: + import tables._comp_bzip2 + bzip2_version = tables._comp_bzip2.register_() + else: + bzip2_version = None + +else: # Unix systems + # Initialize & register lzo + try: + import tables._comp_lzo + lzo_version = tables._comp_lzo.register_() + except ImportError: + lzo_version = None + + # Initialize & register bzip2 + try: + import tables._comp_bzip2 + bzip2_version = tables._comp_bzip2.register_() + except ImportError: + bzip2_version = None + + +# End of initialization code +#--------------------------------------------------------------------- + +# Error handling helpers +cdef herr_t e_walk_cb(unsigned n, const H5E_error_t *err, void *data) with gil: + cdef object bt = data # list + #cdef char major_msg[256] + #cdef char minor_msg[256] + #cdef ssize_t msg_len + + if err == NULL: + return -1 + + #msg_len = H5Eget_msg(err.maj_num, NULL, major_msg, 256) + #if msg_len < 0: + # major_msg[0] = '\0' + + #msg_len = H5Eget_msg(err.min_num, NULL, minor_msg, 256) + #if msg_len < 0: + # minor_msg[0] = '\0' + + #msg = "%s (MAJOR: %s, MINOR: %s)" % ( + # bytes(err.desc).decode('utf-8'), + # bytes(major_msg).decode('utf-8'), + # bytes(minor_msg).decode('utf-8')) + + msg = bytes(err.desc).decode('utf-8') + + bt.append(( + bytes(err.file_name).decode('utf-8'), + err.line, + bytes(err.func_name).decode('utf-8'), + msg, + )) + + return 0 + + +def _dump_h5_backtrace(): + cdef object bt = [] + + if H5Ewalk(H5E_DEFAULT, H5E_WALK_DOWNWARD, e_walk_cb, bt) < 0: + return None + + return bt + + +# Initialization of the _dump_h5_backtrace method of HDF5ExtError. +# The unusual machinery is needed in order to avoid cirdular dependencies +# between modules. +HDF5ExtError._dump_h5_backtrace = _dump_h5_backtrace + + +def silence_hdf5_messages(silence=True): + """silence_hdf5_messages(silence=True) + + Silence (or re-enable) messages from the HDF5 C library. + + The *silence* parameter can be used control the behaviour and reset + the standard HDF5 logging. + + .. versionadded:: 2.4 + + """ + cdef herr_t err + if silence: + err = H5Eset_auto(H5E_DEFAULT, NULL, NULL) + else: + err = H5Eset_auto(H5E_DEFAULT, H5Eprint, stderr) + if err < 0: + raise HDF5ExtError("unable to configure HDF5 internal error handling") + + + + +# Disable automatic HDF5 error logging +silence_hdf5_messages() + + +def _broken_hdf5_long_double(): + # HDF5 < 1.8.12 has a bug that prevents correct identification of the + # long double data type when the code is built with gcc 4.8. + # See also: http://hdf-forum.184993.n3.nabble.com/Issues-with-H5T-NATIVE-LDOUBLE-tt4026450.html + + return H5Tget_order(H5T_NATIVE_DOUBLE) != H5Tget_order(H5T_NATIVE_LDOUBLE) + + +# Helper functions +cdef hsize_t *malloc_dims(object pdims): + """Return a malloced hsize_t dims from a python pdims.""" + + cdef int i, rank + cdef hsize_t *dims + + dims = NULL + rank = len(pdims) + if rank > 0: + dims = malloc(rank * sizeof(hsize_t)) + for i in range(rank): + dims[i] = pdims[i] + return dims + + +cdef hid_t get_native_float_type(hid_t type_id) nogil: + """Get a native type of an HDF5 float type. + + This function also handles half precision (float16) data type. 
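    When the on-disk float type has 16-bit precision and the installed NumPy
    provides ``float16``, a custom IEEE half-precision type is created;
    otherwise the type is simply passed through ``H5Tget_native_type()``.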
+ + """ + + cdef hid_t native_type_id + cdef size_t precision + + precision = H5Tget_precision(type_id) + + if precision == 16 and have_float16: + native_type_id = create_ieee_float16(NULL) + else: + native_type_id = H5Tget_native_type(type_id, H5T_DIR_DEFAULT) + + return native_type_id + + +# This routine is more complex than required because HDF5 1.6.x does +# not implement support for H5Tget_native_type with some types, like +# H5T_BITFIELD and probably others. When 1.8.x would be a requisite, +# this can be simplified. +cdef hid_t get_native_type(hid_t type_id) nogil: + """Get the native type of a HDF5 type.""" + + cdef H5T_class_t class_id, super_class_id + cdef hid_t native_type_id = 0, super_type_id, native_super_type_id + cdef int rank + cdef hsize_t *dims + + class_id = H5Tget_class(type_id) + if class_id == H5T_COMPOUND: + return H5Tget_native_type(type_id, H5T_DIR_DEFAULT) + + elif class_id in (H5T_ARRAY, H5T_VLEN): + # Get the array base component + super_type_id = H5Tget_super(type_id) + # Get the class + super_class_id = H5Tget_class(super_type_id) + if super_class_id == H5T_FLOAT: + # replicate the logic of H5Tget_native_type for H5T_ARRAY and + # H5T_VLEN taking into account extended floating point types + # XXX: HDF5 error check + native_super_type_id = get_native_float_type(super_type_id) + H5Tclose(super_type_id) + if class_id == H5T_ARRAY: + rank = H5Tget_array_ndims(type_id) + dims = malloc(rank * sizeof(hsize_t)) + H5Tget_array_dims(type_id, dims) + native_type_id = H5Tarray_create(native_super_type_id, rank, dims) + free(dims) + H5Tclose(native_super_type_id) + return native_type_id + elif class_id == H5T_VLEN: + native_type_id = H5Tvlen_create(native_super_type_id) + H5Tclose(native_super_type_id) + return native_type_id + class_id = super_class_id + H5Tclose(super_type_id) + + if class_id == H5T_FLOAT: + native_type_id = get_native_float_type(type_id) + elif class_id in (H5T_INTEGER, H5T_ENUM): + native_type_id = H5Tget_native_type(type_id, H5T_DIR_DEFAULT) + else: + # Fixing the byteorder for other types shouldn't be needed. + # More in particular, H5T_TIME is not managed yet by HDF5 and so this + # has to be managed explicitely inside the PyTables extensions. + # Regarding H5T_BITFIELD, well, I'm not sure if changing the byteorder + # of this is a good idea at all. + native_type_id = H5Tcopy(type_id) + + return native_type_id + + +def encode_filename(object filename): + """Return the encoded filename in the filesystem encoding.""" + + cdef bytes encname + + if hasattr(os, 'fspath'): + filename = os.fspath(filename) + + if isinstance(filename, (unicode, numpy.str_)): +# if type(filename) is unicode: + encoding = sys.getfilesystemencoding() + encname = filename.encode(encoding, 'replace') + else: + encname = filename + + return encname + + +# Main functions +def is_hdf5_file(object filename): + """is_hdf5_file(filename) + + Determine whether a file is in the HDF5 format. + + When successful, it returns a true value if the file is an HDF5 + file, false otherwise. If there were problems identifying the file, + an HDF5ExtError is raised. + + """ + + # Check that the file exists and is readable. 
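    # (Illustrative usage, assuming some existing file ``data.h5``:
    #
    #     tables.is_hdf5_file('data.h5')  -> True
    #
    # Any valid HDF5 container qualifies, not only files written by PyTables.)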
+ check_file_access(filename) + + # Encode the filename in case it is unicode + encname = encode_filename(filename) + + ret = H5Fis_hdf5(encname) + if ret < 0: + raise HDF5ExtError("problems identifying file ``%s``" % (filename,)) + return ret > 0 + + + + +def is_pytables_file(object filename): + """is_pytables_file(filename) + + Determine whether a file is in the PyTables format. + + When successful, it returns the format version string if the file is a + PyTables file, None otherwise. If there were problems identifying the + file, an HDF5ExtError is raised. + + """ + + cdef hid_t file_id + cdef object isptf = None # A PYTABLES_FORMAT_VERSION attribute was not found + + if is_hdf5_file(filename): + # Encode the filename in case it is unicode + encname = encode_filename(filename) + # The file exists and is HDF5, that's ok + # Open it in read-only mode + file_id = H5Fopen(encname, H5F_ACC_RDONLY, H5P_DEFAULT) + isptf = read_f_attr(file_id, 'PYTABLES_FORMAT_VERSION') + # Close the file + H5Fclose(file_id) + + # system attributes should always be str + if PyBytes_Check(isptf): + isptf = isptf.decode('utf-8') + + return isptf + + + + +def get_hdf5_version(): + """Get the underlying HDF5 library version""" + + return getHDF5VersionInfo()[1] + + +def which_lib_version(str name): + """which_lib_version(name) + + Get version information about a C library. + + If the library indicated by name is available, this function returns a + 3-tuple containing the major library version as an integer, its full version + as a string, and the version date as a string. If the library is not + available, None is returned. + + The currently supported library names are hdf5, zlib, lzo, bzip2, and blosc. If + another name is given, a ValueError is raised. + + """ + + cdef char *cname = NULL + cdef bytes encoded_name + + encoded_name = name.encode('utf-8') + # get the C pointer + cname = encoded_name + + libnames = ('hdf5', 'zlib', 'lzo', 'bzip2', 'blosc') + + if strcmp(cname, "hdf5") == 0: + binver, strver = getHDF5VersionInfo() + return (binver, strver, None) # Should be always available + elif strcmp(cname, "zlib") == 0: + if zlib_imported: + return (1, zlib.ZLIB_VERSION, None) + elif strcmp(cname, "lzo") == 0: + if lzo_version: + (lzo_version_string, lzo_version_date) = lzo_version + return (lzo_version, lzo_version_string, lzo_version_date) + elif strcmp(cname, "bzip2") == 0: + if bzip2_version: + (bzip2_version_string, bzip2_version_date) = bzip2_version + return (bzip2_version, bzip2_version_string, bzip2_version_date) + elif strncmp(cname, "blosc", 5) == 0: + if blosc_version: + (blosc_version_string, blosc_version_date) = blosc_version + return (blosc_version, blosc_version_string, blosc_version_date) + else: + raise ValueError("asked version of unsupported library ``%s``; " + "supported library names are ``%s``" % (name, libnames)) + + # A supported library was specified, but no version is available. + return None + + + + +# A function returning all the compressors supported by local Blosc +def blosc_compressor_list(): + """ + blosc_compressor_list() + + Returns a list of compressors available in the Blosc build. + + Parameters + ---------- + None + + Returns + ------- + out : list + The list of names. 
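
    Examples
    --------
    The exact list depends on how the Blosc library was built; a typical
    build reports something like::

        ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']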
+ """ + list_compr = blosc_list_compressors().decode() + clist = [str(cname) for cname in list_compr.split(',')] + return clist + + +# Convert compressor code to compressor name +def blosc_compcode_to_compname_(compcode): + """ + blosc_compcode_to_compname() + + Returns the compressor name associated with compressor code. + + Parameters + ---------- + None + + Returns + ------- + out : string + The name of the compressor. + """ + cdef const char *cname + cdef object compname + + compname = b"unknown (report this to developers)" + if blosc_compcode_to_compname(compcode, &cname) >= 0: + compname = cname + return compname.decode() + + +def blosc_get_complib_info_(): + """Get info from compression libraries included in the current build + of blosc. + + Returns a mapping containing the compressor names as keys and the + tuple (complib, version) as values. + + """ + + cdef char *complib + cdef char *version + + cinfo = {} + for name in blosc_list_compressors().split(b','): + ret = blosc_get_complib_info(name, &complib, &version) + if ret < 0: + continue + if isinstance(name, str): + cinfo[name] = (complib, version) + else: + cinfo[name.decode()] = (complib.decode(), version.decode()) + free(complib) + free(version) + + return cinfo + + +def which_class(hid_t loc_id, object name): + """Detects a class ID using heuristics.""" + + cdef H5T_class_t class_id + cdef H5D_layout_t layout + cdef hsize_t nfields + cdef char *field_name1 + cdef char *field_name2 + cdef int i + cdef hid_t type_id, dataset_id + cdef object classId + cdef int rank + cdef hsize_t *dims + cdef hsize_t *maxdims + cdef char byteorder[11] # "irrelevant" fits easily here + cdef bytes encoded_name + + if isinstance(name, unicode): + encoded_name = name.encode('utf-8') + else: + encoded_name = name + + classId = "UNSUPPORTED" # default value + # Get The HDF5 class for the datatype in this dataset + class_id = getHDF5ClassID(loc_id, encoded_name, &layout, &type_id, + &dataset_id) + # Check if this a dataset of supported classtype for ARRAY + if ((class_id == H5T_INTEGER) or + (class_id == H5T_FLOAT) or + (class_id == H5T_BITFIELD) or + (class_id == H5T_TIME) or + (class_id == H5T_ENUM) or + (class_id == H5T_STRING) or + (class_id == H5T_ARRAY) or + (class_id == H5T_REFERENCE)): + if layout == H5D_CHUNKED: + if H5ARRAYget_ndims(dataset_id, &rank) < 0: + raise HDF5ExtError("Problems getting ndims.") + dims = malloc(rank * sizeof(hsize_t)) + maxdims = malloc(rank * sizeof(hsize_t)) + if H5ARRAYget_info(dataset_id, type_id, dims, maxdims, + &class_id, byteorder) < 0: + raise HDF5ExtError("Unable to get array info.") + classId = "CARRAY" + # Check whether some dimension is enlargeable + for i in range(rank): + if maxdims[i] == -1: + classId = "EARRAY" + break + free(dims) + free(maxdims) + else: + classId = "ARRAY" + + elif class_id == H5T_COMPOUND: + # check whether the type is complex or not + iscomplex = False + nfields = H5Tget_nmembers(type_id) + if nfields == 2: + field_name1 = H5Tget_member_name(type_id, 0) + field_name2 = H5Tget_member_name(type_id, 1) + # The pair ("r", "i") is for PyTables. ("real", "imag") for Octave. 
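        # (In other words, a two-field compound whose members are named
        # ("r", "i") or ("real", "imag") is mapped to a complex array below
        # instead of being exposed as a TABLE.)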
+ if ( (strcmp(field_name1, "real") == 0 and + strcmp(field_name2, "imag") == 0) or + (strcmp(field_name1, "r") == 0 and + strcmp(field_name2, "i") == 0) ): + iscomplex = True + pt_H5free_memory(field_name1) + pt_H5free_memory(field_name2) + if layout == H5D_CHUNKED: + if iscomplex: + classId = "CARRAY" + else: + classId = "TABLE" + else: # Not chunked case + # Octave saves complex arrays as non-chunked tables + # with two fields: "real" and "imag" + # Francesc Alted 2005-04-29 + # Get number of records + if iscomplex: + classId = "ARRAY" # It is probably an Octave complex array + else: + # Added to support non-chunked tables + classId = "TABLE" # A test for supporting non-growable tables + + elif class_id == H5T_VLEN: + if layout == H5D_CHUNKED: + classId = "VLARRAY" + + # Release the datatype. + H5Tclose(type_id) + + # Close the dataset. + H5Dclose(dataset_id) + + # Fallback + return classId + + + + +def get_nested_field(recarray, fieldname): + """Get the maybe nested field named `fieldname` from the `recarray`. + + The `fieldname` may be a simple field name or a nested field name + with slah-separated components. + + """ + + cdef bytes name = fieldname.encode('utf-8') + try: + if strchr(name, 47) != NULL: # ord('/') == 47 + # It may be convenient to implement this way of descending nested + # fields into the ``__getitem__()`` method of a subclass of + # ``numpy.ndarray``. -- ivb + field = recarray + for nfieldname in fieldname.split('/'): + field = field[nfieldname] + else: + # Faster method for non-nested columns + field = recarray[fieldname] + except KeyError: + raise KeyError("no such column: %s" % (fieldname,)) + return field + + + + +def read_f_attr(hid_t file_id, str attr_name): + """Read PyTables file attributes (i.e. in root group). + + Returns the value of the `attr_name` attribute in root group, or `None` + if it does not exist. This call cannot fail. + + """ + + cdef size_t size + cdef char *attr_value + cdef int cset = H5T_CSET_ASCII + cdef object retvalue + cdef bytes encoded_attr_name + cdef char *c_attr_name = NULL + + encoded_attr_name = attr_name.encode('utf-8') + # Get the C pointer + c_attr_name = encoded_attr_name + + attr_value = NULL + retvalue = None + # Check if attribute exists + if H5ATTRfind_attribute(file_id, c_attr_name): + # Read the attr_name attribute + size = H5ATTRget_attribute_string(file_id, c_attr_name, &attr_value, &cset) + if size == 0: + if cset == H5T_CSET_UTF8: + retvalue = numpy.unicode_('') + else: + retvalue = numpy.bytes_(b'') + else: + retvalue = (attr_value).rstrip(b'\x00') + if cset == H5T_CSET_UTF8: + retvalue = retvalue.decode('utf-8') + retvalue = numpy.str_(retvalue) + else: + retvalue = numpy.bytes_(retvalue) # bytes + + # Important to release attr_value, because it has been malloc'ed! + if attr_value: + free(attr_value) + + return retvalue + + +def get_filters(parent_id, name): + """Get a dictionary with the filter names and cd_values""" + + cdef bytes encoded_name + + encoded_name = name.encode('utf-8') + + return get_filter_names(parent_id, encoded_name) + + + + +# This is used by several ._convert_types() methods. +def get_type_enum(hid_t h5type): + """_getTypeEnum(h5type) -> hid_t + + Get the native HDF5 enumerated type of `h5type`. + + If `h5type` is an enumerated type, it is returned. If it is a + variable-length type with an enumerated base type, this is returned. If it + is a multi-dimensional type with an enumerated base type, this is returned. + Else, a ``TypeError`` is raised. 
+ + """ + + cdef H5T_class_t typeClass + cdef hid_t enumId, enumId2 + + typeClass = H5Tget_class(h5type) + if typeClass < 0: + raise HDF5ExtError("failed to get class of HDF5 type") + + if typeClass == H5T_ENUM: + # Get the native type (in order to do byteorder conversions automatically) + enumId = H5Tget_native_type(h5type, H5T_DIR_DEFAULT) + elif typeClass in (H5T_ARRAY, H5T_VLEN): + # The field is multi-dimensional or variable length. + enumId2 = H5Tget_super(h5type) + enumId = get_type_enum(enumId2) + H5Tclose(enumId2) + else: + raise TypeError( + "enumerated values can not be stored using the given type") + return enumId + + + +def enum_from_hdf5(hid_t enumId, str byteorder): + """enum_from_hdf5(enumId) -> (Enum, npType) + + Convert an HDF5 enumerated type to a PyTables one. + + This function takes an HDF5 enumerated type and returns an `Enum` + instance built from that, and the NumPy type used to encode it. + + """ + + cdef hid_t baseId + cdef int nelems, npenum, i + cdef void *rbuf + cdef char *ename + cdef ndarray npvalue + cdef object dtype + cdef str pyename + + # Find the base type of the enumerated type, and get the atom + baseId = H5Tget_super(enumId) + atom = atom_from_hdf5_type(baseId) + H5Tclose(baseId) + if atom.kind not in ('int', 'uint'): + raise NotImplementedError("sorry, only integer concrete values are " + "supported at this moment") + + dtype = atom.dtype + npvalue = numpy.array((0,), dtype=dtype) + rbuf = PyArray_DATA(npvalue) + + # Get the name and value of each of the members + # and put the pair in `enumDict`. + enumDict = {} + + nelems = H5Tget_nmembers(enumId) + if enumId < 0: + raise HDF5ExtError( + "failed to get element count of HDF5 enumerated type") + + for i in range(nelems): + ename = H5Tget_member_name(enumId, i) + if ename == NULL: + raise HDF5ExtError( + "failed to get element name from HDF5 enumerated type") + + pyename = cstr_to_pystr(ename) + + pt_H5free_memory(ename) + + if H5Tget_member_value(enumId, i, rbuf) < 0: + raise HDF5ExtError( + "failed to get element value from HDF5 enumerated type") + + enumDict[pyename] = npvalue[0] # converted to NumPy scalar + + # Build an enumerated type from `enumDict` and return it. + return Enum(enumDict), dtype + + + + +def enum_to_hdf5(object enum_atom, str byteorder): + """Convert a PyTables enumerated type to an HDF5 one. + + This function creates an HDF5 enumerated type from the information + contained in `enumAtom` (an ``Atom`` object), with the specified + `byteorder` (a string). The resulting HDF5 enumerated type is + returned. + + """ + + cdef hid_t base_id, enum_id + cdef object base_atom + cdef ndarray values + + # Get the base HDF5 type and create the enumerated type. + base_atom = Atom.from_dtype(enum_atom.dtype.base) + base_id = atom_to_hdf5_type(base_atom, byteorder) + + try: + enum_id = H5Tenum_create(base_id) + if enum_id < 0: + raise HDF5ExtError("failed to create HDF5 enumerated type") + + finally: + if H5Tclose(base_id) < 0: + raise HDF5ExtError("failed to close HDF5 base type") + + try: + # Set the name and value of each of the members. 
+ names = enum_atom._names + values = enum_atom._values + + # This saves the default enum value first so that we can restore it + default_name = enum_atom._defname + index_default = names.index(default_name) + H5Tenum_insert(enum_id, default_name.encode('utf-8'), + PyArray_GETPTR1(values, index_default)) + + for i, n in enumerate(names): + # Skip the default value as we have already inserted it before + if i == index_default: + continue + + if H5Tenum_insert(enum_id, n.encode('utf-8'), + PyArray_GETPTR1(values, i)) < 0: + raise HDF5ExtError("failed to insert value into HDF5 enumerated type") + + # Return the new, open HDF5 enumerated type. + return enum_id + + except: + if H5Tclose(enum_id) < 0: + raise HDF5ExtError("failed to close HDF5 enumerated type") + + raise + + +def atom_to_hdf5_type(atom, str byteorder): + cdef hid_t tid = -1 + cdef hid_t tid2 = -1 + cdef hsize_t *dims = NULL + cdef bytes encoded_byteorder + cdef char *cbyteorder = NULL + + encoded_byteorder = byteorder.encode('utf-8') + # Get the C pointer + cbyteorder = encoded_byteorder + + # Create the base HDF5 type + if atom.type in pttype_to_hdf5: + tid = H5Tcopy(pttype_to_hdf5[atom.type]) + # Fix the byteorder + if atom.kind != 'time': + set_order(tid, cbyteorder) + elif atom.type == 'float16': + tid = create_ieee_float16(cbyteorder) + elif atom.kind in pt_special_kinds: + # Special cases (the byteorder doesn't need to be fixed afterwards) + if atom.type == 'complex64': + tid = create_ieee_complex64(cbyteorder) + elif atom.type == 'complex128': + tid = create_ieee_complex128(cbyteorder) + elif atom.type == 'complex192': + tid = create_ieee_complex192(cbyteorder) + elif atom.type == 'complex256': + tid = create_ieee_complex256(cbyteorder) + elif atom.kind == 'string': + tid = H5Tcopy(H5T_C_S1); + H5Tset_size(tid, atom.itemsize) + elif atom.kind == 'bool': + tid = H5Tcopy(H5T_STD_B8); + elif atom.kind == 'enum': + tid = enum_to_hdf5(atom, byteorder) + else: + raise TypeError("Invalid type for atom %s" % (atom,)) + # Create an H5T_ARRAY in case of non-scalar atoms + if atom.shape != (): + dims = malloc_dims(atom.shape) + tid2 = H5Tarray_create(tid, len(atom.shape), dims) + free(dims) + H5Tclose(tid) + tid = tid2 + + return tid + + + + +def load_enum(hid_t type_id): + """load_enum() -> (Enum, npType) + + Load the enumerated HDF5 type associated with this type_id. + + It returns an `Enum` instance built from that, and the + NumPy type used to encode it. + + """ + + cdef hid_t enumId + cdef char c_byteorder[11] # "irrelevant" fits well here + cdef str byteorder + + # Get the enumerated type + enumId = get_type_enum(type_id) + + # Get the byteorder + get_order(type_id, c_byteorder) + byteorder = cstr_to_pystr(c_byteorder) + # Get the Enum and NumPy types and close the HDF5 type. + try: + return enum_from_hdf5(enumId, byteorder) + finally: + # (Yes, the ``finally`` clause *is* executed.) 
+ if H5Tclose(enumId) < 0: + raise HDF5ExtError("failed to close HDF5 enumerated type") + + + +def hdf5_to_np_nested_type(hid_t type_id): + """Given a HDF5 `type_id`, return a dtype string representation of it.""" + + cdef hid_t member_type_id + cdef hid_t member_offset + cdef hsize_t nfields + cdef int i + cdef char *c_colname + cdef H5T_class_t class_id + cdef object desc + cdef str colname + + desc = {} + # Get the number of members + nfields = H5Tget_nmembers(type_id) + # Iterate thru the members + for i in range(nfields): + # Get the member name + c_colname = H5Tget_member_name(type_id, i) + colname = cstr_to_pystr(c_colname) + + # Get the member type + member_type_id = H5Tget_member_type(type_id, i) + member_offset = H5Tget_member_offset(type_id, i) + + # Get the HDF5 class + class_id = H5Tget_class(member_type_id) + if class_id == H5T_COMPOUND and not is_complex(member_type_id): + desc[colname] = hdf5_to_np_nested_type(member_type_id) + desc[colname]["_v_pos"] = i + desc[colname]["_v_offset"] = member_offset + else: + atom = atom_from_hdf5_type(member_type_id, pure_numpy_types=True) + desc[colname] = Col.from_atom(atom, pos=i, _offset=member_offset) + + # Release resources + H5Tclose(member_type_id) + pt_H5free_memory(c_colname) + + return desc + + + +def hdf5_to_np_ext_type(hid_t type_id, pure_numpy_types=True, atom=False, ptparams=None): + """Map the atomic HDF5 type to a string repr of NumPy extended codes. + + If `pure_numpy_types` is true, detected HDF5 types that does not match pure + NumPy types will raise a ``TypeError`` exception. If not, HDF5 types like + TIME, VLEN or ENUM are passed through. + + If `atom` is true, the resulting repr is meant for atoms. If not, the + result is meant for attributes. + + Returns the string repr of type and its shape. The exception is for + compounds types, that returns a NumPy dtype and shape instead. + + """ + + cdef H5T_sign_t sign + cdef hid_t super_type_id, native_type_id + cdef H5T_class_t class_id + cdef size_t itemsize + cdef object stype, shape, shape2 + cdef hsize_t *dims + + # default shape + shape = () + # Get the HDF5 class + class_id = H5Tget_class(type_id) + # Get the itemsize + itemsize = H5Tget_size(type_id) + + if class_id == H5T_BITFIELD: + stype = "b1" + elif class_id == H5T_INTEGER: + # Get the sign + sign = H5Tget_sign(type_id) + if sign > 0: + stype = "i%s" % itemsize + else: + stype = "u%s" % itemsize + elif class_id == H5T_FLOAT: + stype = "f%s" % itemsize + elif class_id == H5T_COMPOUND: + if is_complex(type_id): + stype = "c%s" % itemsize + else: + if atom: + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + desc = Description(hdf5_to_np_nested_type(type_id), ptparams=ptparams) + # stype here is not exactly a string, but the NumPy dtype factory + # will deal with this. 
+ stype = desc._v_dtype + elif class_id == H5T_STRING: + if H5Tis_variable_str(type_id): + raise TypeError("variable length strings are not supported yet") + stype = "S%s" % itemsize + elif class_id == H5T_TIME: + if pure_numpy_types: + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + stype = "t%s" % itemsize + elif class_id == H5T_ENUM: + if pure_numpy_types: + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + stype = "e" + elif class_id == H5T_VLEN: + if pure_numpy_types: + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + # Get the variable length base component + super_type_id = H5Tget_super(type_id) + # Find the super member format + stype, shape = hdf5_to_np_ext_type(super_type_id, pure_numpy_types) + # Release resources + H5Tclose(super_type_id) + elif class_id == H5T_REFERENCE: + # only standard referenced objects (for atoms) are now supported + if not atom or not H5Tequal(type_id, H5T_STD_REF_OBJ): + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + stype = "_ref_" + elif class_id == H5T_ARRAY: + # Get the array base component + super_type_id = H5Tget_super(type_id) + # Find the super member format + stype, shape2 = hdf5_to_np_ext_type(super_type_id, pure_numpy_types) + # Get shape + shape = [] + ndims = H5Tget_array_ndims(type_id) + dims = malloc(ndims * sizeof(hsize_t)) + H5Tget_array_dims(type_id, dims) + for i in range(ndims): + shape.append(dims[i]) # cast to avoid long representation (i.e. 2L) + shape = tuple(shape) + # Release resources + free(dims) + H5Tclose(super_type_id) + else: + # Other types are not supported yet + raise TypeError("the HDF5 class ``%s`` is not supported yet" + % hdf5_class_to_string[class_id]) + + return stype, shape + + + + +def atom_from_hdf5_type(hid_t type_id, pure_numpy_types=False): + """Get an atom from a type_id. + + See `hdf5_to_np_ext_type` for an explanation of the `pure_numpy_types` + parameter. + + """ + + cdef object stype, shape, atom_, sctype, tsize, kind + cdef object dflt, base, enum_, nptype + + stype, shape = hdf5_to_np_ext_type(type_id, pure_numpy_types, atom=True) + # Create the Atom + if stype == '_ref_': + atom_ = ReferenceAtom(shape=shape) + elif stype == 'e': + (enum_, nptype) = load_enum(type_id) + # Take one of the names as the default in the enumeration. 
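        # (The [0] subscript below assumes that iterating the Enum yields
        # (name, value) pairs; an arbitrary member *name* becomes the default
        # passed to EnumAtom.)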
+ dflt = next(iter(enum_))[0] + base = Atom.from_dtype(nptype) + atom_ = EnumAtom(enum_, dflt, base, shape=shape) + else: + kind = npext_prefixes_to_ptkinds[stype[0]] + tsize = int(stype[1:]) + atom_ = Atom.from_kind(kind, tsize, shape=shape) + + return atom_ + + + +def create_nested_type(object desc, str byteorder): + """Create a nested type based on a description and return an HDF5 type.""" + + cdef hid_t tid, tid2 + cdef size_t offset + cdef bytes encoded_name + + tid = H5Tcreate(H5T_COMPOUND, desc._v_itemsize) + if tid < 0: + return -1 + + offset = desc._v_offsets[0] if desc._v_offsets else 0 + for i, k in enumerate(desc._v_names): + obj = desc._v_colobjects[k] + if isinstance(obj, Description): + tid2 = create_nested_type(obj, byteorder) + else: + tid2 = atom_to_hdf5_type(obj, byteorder) + encoded_name = k.encode('utf-8') + if desc._v_offsets: + offset = desc._v_offsets[i] + H5Tinsert(tid, encoded_name, offset, tid2) + if not desc._v_offsets: + offset += desc._v_dtype[k].itemsize + # Release resources + H5Tclose(tid2) + + return tid + + +cdef int load_reference(hid_t dataset_id, hobj_ref_t *refbuf, size_t item_size, ndarray nparr) except -1: + """Load a reference as an array of objects + :param dataset_id: dataset of the reference + :param refbuf: load the references requested + :param item_size: size of the reference in the file read into refbuf + :param nparr: numpy object array already pre-allocated with right size and shape for refbuf references + """ + cdef size_t nelements = nparr.size + cdef int i, j + cdef hid_t refobj_id = -1 # if valid can be only be a dataset id + cdef hid_t reftype_id + cdef hid_t disk_type_id = -1 + cdef void *rbuf + cdef int rank = 0 + cdef hsize_t *maxdims = NULL + cdef hsize_t *dims = NULL + cdef char cbyteorder[11] + cdef H5T_class_t class_id + cdef hsize_t nrows + cdef ndarray nprefarr + cdef int extdim + cdef hobj_ref_t *newrefbuf = NULL + + + if refbuf == NULL: + raise ValueError("Invalid reference buffer") + + try: + + for i in range(nelements): + refobj_id = H5Rdereference(dataset_id, H5R_OBJECT, &refbuf[i]) + if H5Iget_type(refobj_id) != H5I_DATASET: + raise ValueError('Invalid reference type %d %d' % (H5Iget_type(refobj_id), item_size)) + disk_type_id = H5Dget_type(refobj_id) + reftype_id = get_native_type(disk_type_id) + # Get the rank for this array object + if H5ARRAYget_ndims(refobj_id, &rank) < 0: + raise HDF5ExtError("Problems getting ndims!") + + dims = malloc(rank * sizeof(hsize_t)) + maxdims = malloc(rank * sizeof(hsize_t)) + # Get info on dimensions, class and type (of base class) + ret = H5ARRAYget_info(refobj_id, disk_type_id, + dims, maxdims, + &class_id, cbyteorder) + if ret < 0: + raise HDF5ExtError("Unable to get array info.") + + # Get the extendable dimension (if any) + extdim = -1 # default is non-extensible Array + for j in range(rank): + if maxdims[j] == -1: + extdim = j + break + if extdim < 0: + extdim += rank + + nrows = dims[extdim] + + # read entire dataset as numpy array + stype_, shape_ = hdf5_to_np_ext_type(reftype_id, pure_numpy_types=True, atom=True) + if stype_ == "_ref_": + dtype_ = numpy.dtype("O", shape_) + else: + dtype_ = numpy.dtype(stype_, shape_) + shape = [] + for j in range(rank): + shape.append(dims[j]) + shape = tuple(shape) + + nprefarr = numpy.empty(dtype=dtype_, shape=shape) + nparr[i] = [nprefarr] # box the array in a list to store it as one object + if stype_ == "_ref_": + newrefbuf = malloc(nprefarr.size * item_size) + rbuf = newrefbuf + else: + rbuf = PyArray_DATA(nprefarr) + + # Do the 
physical read + with nogil: + ret = H5ARRAYread(refobj_id, reftype_id, 0, nrows, 1, extdim, rbuf) + if ret < 0: + raise HDF5ExtError("Problems reading the array data.") + + if stype_ == "_ref_": + # recurse to read the reference + load_reference(refobj_id, newrefbuf, item_size, nprefarr) + + # close objects + if newrefbuf: + free(newrefbuf) + newrefbuf = NULL + H5Oclose(refobj_id) + refobj_id = -1 + H5Tclose(reftype_id) + reftype_id = -1 + H5Tclose(disk_type_id) + disk_type_id = -1 + free(maxdims) + maxdims = NULL + free(dims) + dims = NULL + finally: + if newrefbuf: + free(newrefbuf) + newrefbuf = NULL + if refobj_id >= 0: + H5Oclose(refobj_id) + if reftype_id >= 0: + H5Tclose(reftype_id) + if disk_type_id >= 0: + H5Tclose(disk_type_id) + if maxdims: + free(maxdims) + if dims: + free(dims) + + # no error + return 0 + +## Local Variables: +## mode: python +## py-indent-offset: 2 +## tab-width: 2 +## fill-column: 78 +## End: diff --git a/tables/vlarray.py b/tables/vlarray.py new file mode 100644 index 0000000..e1b4b2c --- /dev/null +++ b/tables/vlarray.py @@ -0,0 +1,864 @@ +"""Here is defined the VLArray class.""" + +import operator +import sys +import numpy as np + +from . import hdf5extension +from .atom import ObjectAtom, VLStringAtom, VLUnicodeAtom +from .flavor import internal_to_flavor +from .leaf import Leaf, calc_chunksize +from .utils import ( + convert_to_np_atom, convert_to_np_atom2, idx2long, correct_byteorder, + SizeType, is_idx, lazyattr) + + +# default version for VLARRAY objects +# obversion = "1.0" # initial version +# obversion = "1.0" # add support for complex datatypes +# obversion = "1.1" # This adds support for time datatypes. +# obversion = "1.2" # This adds support for enumerated datatypes. +# obversion = "1.3" # Introduced 'PSEUDOATOM' attribute. +obversion = "1.4" # Numeric and numarray flavors are gone. + + +class VLArray(hdf5extension.VLArray, Leaf): + """This class represents variable length (ragged) arrays in an HDF5 file. + + Instances of this class represent array objects in the object tree + with the property that their rows can have a *variable* number of + homogeneous elements, called *atoms*. Like Table datasets (see + :ref:`TableClassDescr`), variable length arrays can have only one + dimension, and the elements (atoms) of their rows can be fully + multidimensional. + + When reading a range of rows from a VLArray, you will *always* get + a Python list of objects of the current flavor (each of them for a + row), which may have different lengths. + + This class provides methods to write or read data to or from + variable length array objects in the file. Note that it also + inherits all the public attributes and methods that Leaf (see + :ref:`LeafClassDescr`) already provides. + + .. note:: + + VLArray objects also support compression although compression + is only performed on the data structures used internally by + the HDF5 to take references of the location of the variable + length data. Data itself (the raw data) are not compressed + or filtered. + + Please refer to the `VLTypes Technical Note + `_ + for more details on the topic. + + Parameters + ---------- + parentnode + The parent :class:`Group` object. + name : str + The name of this node in its parent group. + atom + An `Atom` instance representing the *type* and *shape* of the atomic + objects to be saved. + title + A description for this node (it sets the ``TITLE`` HDF5 attribute on + disk). 
+ filters + An instance of the `Filters` class that provides information about the + desired I/O filters to be applied during the life of this object. + expectedrows + A user estimate about the number of row elements that will + be added to the growable dimension in the `VLArray` node. + If not provided, the default value is ``EXPECTED_ROWS_VLARRAY`` + (see ``tables/parameters.py``). If you plan to create either + a much smaller or a much bigger `VLArray` try providing a guess; + this will optimize the HDF5 B-Tree creation and management + process time and the amount of memory used. + + .. versionadded:: 3.0 + + chunkshape + The shape of the data chunk to be read or written in a single HDF5 I/O + operation. Filters are applied to those chunks of data. The + dimensionality of `chunkshape` must be 1. If ``None``, a sensible + value is calculated (which is recommended). + byteorder + The byteorder of the data *on disk*, specified as 'little' or 'big'. + If this is not specified, the byteorder is that of the platform. + + track_times + Whether time data associated with the leaf are recorded (object + access time, raw data modification time, metadata change time, object + birth time); default True. Semantics of these times depend on their + implementation in the HDF5 library: refer to documentation of the + H5O_info_t data structure. As of HDF5 1.8.15, only ctime (metadata + change time) is implemented. + + .. versionadded:: 3.4.3 + + + .. versionchanged:: 3.0 + *parentNode* renamed into *parentnode*. + + .. versionchanged:: 3.0 + The *expectedsizeinMB* parameter has been replaced by *expectedrows*. + + Examples + -------- + See below a small example of the use of the VLArray class. The code is + available in :file:`examples/vlarray1.py`:: + + import numpy as np + import tables as tb + + # Create a VLArray: + fileh = tb.open_file('vlarray1.h5', mode='w') + vlarray = fileh.create_vlarray( + fileh.root, + 'vlarray1', + tb.Int32Atom(shape=()), + "ragged array of ints", + filters=tb.Filters(1)) + + # Append some (variable length) rows: + vlarray.append(np.array([5, 6])) + vlarray.append(np.array([5, 6, 7])) + vlarray.append([5, 6, 9, 8]) + + # Now, read it through an iterator: + print('-->', vlarray.title) + for x in vlarray: + print('%s[%d]--> %s' % (vlarray.name, vlarray.nrow, x)) + + # Now, do the same with native Python strings. + vlarray2 = fileh.create_vlarray( + fileh.root, + 'vlarray2', + tb.StringAtom(itemsize=2), + "ragged array of strings", + filters=tb.Filters(1)) + vlarray2.flavor = 'python' + + # Append some (variable length) rows: + print('-->', vlarray2.title) + vlarray2.append(['5', '66']) + vlarray2.append(['5', '6', '77']) + vlarray2.append(['5', '6', '9', '88']) + + # Now, read it through an iterator: + for x in vlarray2: + print('%s[%d]--> %s' % (vlarray2.name, vlarray2.nrow, x)) + + # Close the file. + fileh.close() + + The output for the previous script is something like:: + + --> ragged array of ints + vlarray1[0]--> [5 6] + vlarray1[1]--> [5 6 7] + vlarray1[2]--> [5 6 9 8] + --> ragged array of strings + vlarray2[0]--> ['5', '66'] + vlarray2[1]--> ['5', '6', '77'] + vlarray2[2]--> ['5', '6', '9', '88'] + + + .. rubric:: VLArray attributes + + The instance variables below are provided in addition to those in + Leaf (see :ref:`LeafClassDescr`). + + .. attribute:: atom + + An Atom (see :ref:`AtomClassDescr`) + instance representing the *type* and + *shape* of the atomic objects to be + saved. 
You may use a *pseudo-atom* for + storing a serialized object or variable length string per row. + + .. attribute:: flavor + + The type of data object read from this leaf. + + Please note that when reading several rows of VLArray data, + the flavor only applies to the *components* of the returned + Python list, not to the list itself. + + .. attribute:: nrow + + On iterators, this is the index of the current row. + + .. attribute:: nrows + + The current number of rows in the array. + + .. attribute:: extdim + + The index of the enlargeable dimension (always 0 for vlarrays). + + """ + + # Class identifier. + _c_classid = 'VLARRAY' + + @lazyattr + def dtype(self): + """The NumPy ``dtype`` that most closely matches this array.""" + return self.atom.dtype + + @property + def shape(self): + """The shape of the stored array.""" + return (self.nrows,) + + @property + def size_on_disk(self): + """ + The HDF5 library does not include a function to determine size_on_disk + for variable-length arrays. Accessing this attribute will raise a + NotImplementedError. + """ + raise NotImplementedError('size_on_disk not implemented for VLArrays') + + @property + def size_in_memory(self): + """ + The size of this array's data in bytes when it is fully loaded + into memory. + + .. note:: + + When data is stored in a VLArray using the ObjectAtom type, + it is first serialized using pickle, and then converted to + a NumPy array suitable for storage in an HDF5 file. + This attribute will return the size of that NumPy + representation. If you wish to know the size of the Python + objects after they are loaded from disk, you can use this + `ActiveState recipe + `_. + """ + return self._get_memory_size() + + def __init__(self, parentnode, name, atom=None, title="", + filters=None, expectedrows=None, + chunkshape=None, byteorder=None, + _log=True, track_times=True): + + self._v_version = None + """The object version of this array.""" + + self._v_new = new = atom is not None + """Is this the first time the node has been created?""" + + self._v_new_title = title + """New title for this node.""" + + self._v_new_filters = filters + """New filter properties for this array.""" + + if expectedrows is None: + expectedrows = parentnode._v_file.params['EXPECTED_ROWS_VLARRAY'] + self._v_expectedrows = expectedrows + """The expected number of rows to be stored in the array. + + .. versionadded:: 3.0 + + """ + + self._v_chunkshape = None + """Private storage for the `chunkshape` property of Leaf.""" + + # Miscellaneous iteration rubbish. + self._start = None + """Starting row for the current iteration.""" + + self._stop = None + """Stopping row for the current iteration.""" + + self._step = None + """Step size for the current iteration.""" + + self._nrowsread = None + """Number of rows read up to the current state of iteration.""" + + self._startb = None + """Starting row for current buffer.""" + + self._stopb = None + """Stopping row for current buffer. """ + + self._row = None + """Current row in iterators (sentinel).""" + + self._init = False + """Whether we are in the middle of an iteration or not (sentinel).""" + + self.listarr = None + """Current buffer in iterators.""" + + # Documented (*public*) attributes. + self.atom = atom + """ + An Atom (see :ref:`AtomClassDescr`) instance representing the + *type* and *shape* of the atomic objects to be saved. You may + use a *pseudo-atom* for storing a serialized object or + variable length string per row. 
+ """ + self.nrow = None + """On iterators, this is the index of the current row.""" + + self.nrows = None + """The current number of rows in the array.""" + + self.extdim = 0 # VLArray only have one dimension currently + """The index of the enlargeable dimension (always 0 for vlarrays).""" + + # Check the chunkshape parameter + if new and chunkshape is not None: + if isinstance(chunkshape, (int, np.integer)): + chunkshape = (chunkshape,) + try: + chunkshape = tuple(chunkshape) + except TypeError: + raise TypeError( + "`chunkshape` parameter must be an integer or sequence " + "and you passed a %s" % type(chunkshape)) + if len(chunkshape) != 1: + raise ValueError("`chunkshape` rank (length) must be 1: %r" + % (chunkshape,)) + self._v_chunkshape = tuple(SizeType(s) for s in chunkshape) + + super().__init__(parentnode, name, new, filters, + byteorder, _log, track_times) + + def _g_post_init_hook(self): + super()._g_post_init_hook() + self.nrowsinbuf = 100 # maybe enough for most applications + + # This is too specific for moving it into Leaf + def _calc_chunkshape(self, expectedrows): + """Calculate the size for the HDF5 chunk.""" + + # For computing the chunkshape for HDF5 VL types, we have to + # choose the itemsize of the *each* element of the atom and + # not the size of the entire atom. I don't know why this + # should be like this, perhaps I should report this to the + # HDF5 list. + # F. Alted 2006-11-23 + # elemsize = self.atom.atomsize() + elemsize = self._basesize + + # AV 2013-05-03 + # This is just a quick workaround tha allows to change the API for + # PyTables 3.0 release and remove the expected_mb parameter. + # The algorithm for computing the chunkshape should be rewritten as + # requested by gh-35. + expected_mb = expectedrows * elemsize / 1024 ** 2 + + chunksize = calc_chunksize(expected_mb) + + # Set the chunkshape + chunkshape = chunksize // elemsize + # Safeguard against itemsizes being extremely large + if chunkshape == 0: + chunkshape = 1 + return (SizeType(chunkshape),) + + def _g_create(self): + """Create a variable length array (ragged array).""" + + atom = self.atom + self._v_version = obversion + # Check for zero dims in atom shape (not allowed in VLArrays) + zerodims = np.sum(np.array(atom.shape) == 0) + if zerodims > 0: + raise ValueError("When creating VLArrays, none of the dimensions " + "of the Atom instance can be zero.") + + if not hasattr(atom, 'size'): # it is a pseudo-atom + self._atomicdtype = atom.base.dtype + self._atomicsize = atom.base.size + self._basesize = atom.base.itemsize + else: + self._atomicdtype = atom.dtype + self._atomicsize = atom.size + self._basesize = atom.itemsize + self._atomictype = atom.type + self._atomicshape = atom.shape + + # Compute the optimal chunkshape, if needed + if self._v_chunkshape is None: + self._v_chunkshape = self._calc_chunkshape(self._v_expectedrows) + + self.nrows = SizeType(0) # No rows at creation time + + # Correct the byteorder if needed + if self.byteorder is None: + self.byteorder = correct_byteorder(atom.type, sys.byteorder) + + # After creating the vlarray, ``self._v_objectid`` needs to be + # set because it is needed for setting attributes afterwards. + self._v_objectid = self._create_array(self._v_new_title) + + # Add an attribute in case we have a pseudo-atom so that we + # can retrieve the proper class after a re-opening operation. 
+ if not hasattr(atom, 'size'): # it is a pseudo-atom + self.attrs.PSEUDOATOM = atom.kind + + return self._v_objectid + + def _g_open(self): + """Get the metadata info for an array in file.""" + + self._v_objectid, self.nrows, self._v_chunkshape, atom = \ + self._open_array() + + # Check if the atom can be a PseudoAtom + if "PSEUDOATOM" in self.attrs: + kind = self.attrs.PSEUDOATOM + if kind == 'vlstring': + atom = VLStringAtom() + elif kind == 'vlunicode': + atom = VLUnicodeAtom() + elif kind == 'object': + atom = ObjectAtom() + else: + raise ValueError( + "pseudo-atom name ``%s`` not known." % kind) + elif self._v_file.format_version[:1] == "1": + flavor1x = self.attrs.FLAVOR + if flavor1x == "VLString": + atom = VLStringAtom() + elif flavor1x == "Object": + atom = ObjectAtom() + + self.atom = atom + return self._v_objectid + + def _getnobjects(self, nparr): + """Return the number of objects in a NumPy array.""" + + # Check for zero dimensionality array + zerodims = np.sum(np.array(nparr.shape) == 0) + if zerodims > 0: + # No objects to be added + return 0 + shape = nparr.shape + atom_shape = self.atom.shape + shapelen = len(nparr.shape) + if isinstance(atom_shape, tuple): + atomshapelen = len(self.atom.shape) + else: + atom_shape = (self.atom.shape,) + atomshapelen = 1 + diflen = shapelen - atomshapelen + if shape == atom_shape: + nobjects = 1 + elif (diflen == 1 and shape[diflen:] == atom_shape): + # Check if the leading dimensions are all ones + # if shape[:diflen-1] == (1,)*(diflen-1): + # nobjects = shape[diflen-1] + # shape = shape[diflen:] + # It's better to accept only inputs with the exact dimensionality + # i.e. a dimensionality only 1 element larger than atom + nobjects = shape[0] + shape = shape[1:] + elif atom_shape == (1,) and shapelen == 1: + # Case where shape = (N,) and shape_atom = 1 or (1,) + nobjects = shape[0] + else: + raise ValueError("The object '%s' is composed of elements with " + "shape '%s', which is not compatible with the " + "atom shape ('%s')." % (nparr, shape, atom_shape)) + return nobjects + + def get_enum(self): + """Get the enumerated type associated with this array. + + If this array is of an enumerated type, the corresponding Enum instance + (see :ref:`EnumClassDescr`) is returned. If it is not of an enumerated + type, a TypeError is raised. + + """ + + if self.atom.kind != 'enum': + raise TypeError("array ``%s`` is not of an enumerated type" + % self._v_pathname) + + return self.atom.enum + + def append(self, sequence): + """Add a sequence of data to the end of the dataset. + + This method appends the objects in the sequence to a *single row* in + this array. The type and shape of individual objects must be compliant + with the atoms in the array. In the case of serialized objects and + variable length strings, the object or string to append is itself the + sequence. + + """ + + self._g_check_open() + self._v_file._check_writable() + + # Prepare the sequence to convert it into a NumPy object + atom = self.atom + if not hasattr(atom, 'size'): # it is a pseudo-atom + sequence = atom.toarray(sequence) + statom = atom.base + else: + try: # fastest check in most cases + len(sequence) + except TypeError: + raise TypeError("argument is not a sequence") + statom = atom + + if len(sequence) > 0: + # The sequence needs to be copied to make the operation safe + # to in-place conversion. 
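            # (convert_to_np_atom2() is expected to build a NumPy array with
            # the atom's dtype from a copy of the input, so the caller's
            # sequence is left untouched.)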
+ nparr = convert_to_np_atom2(sequence, statom) + nobjects = self._getnobjects(nparr) + else: + nobjects = 0 + nparr = None + + self._append(nparr, nobjects) + self.nrows += 1 + + def iterrows(self, start=None, stop=None, step=None): + """Iterate over the rows of the array. + + This method returns an iterator yielding an object of the current + flavor for each selected row in the array. + + If a range is not supplied, *all the rows* in the array are iterated + upon. You can also use the :meth:`VLArray.__iter__` special method for + that purpose. If you only want to iterate over a given *range of rows* + in the array, you may use the start, stop and step parameters. + + Examples + -------- + + :: + + for row in vlarray.iterrows(step=4): + print('%s[%d]--> %s' % (vlarray.name, vlarray.nrow, row)) + + .. versionchanged:: 3.0 + If the *start* parameter is provided and *stop* is None then the + array is iterated from *start* to the last line. + In PyTables < 3.0 only one element was returned. + + """ + + (self._start, self._stop, self._step) = self._process_range( + start, stop, step) + self._init_loop() + return self + + def __iter__(self): + """Iterate over the rows of the array. + + This is equivalent to calling :meth:`VLArray.iterrows` with default + arguments, i.e. it iterates over *all the rows* in the array. + + Examples + -------- + + :: + + result = [row for row in vlarray] + + Which is equivalent to:: + + result = [row for row in vlarray.iterrows()] + + """ + + if not self._init: + # If the iterator is called directly, assign default variables + self._start = 0 + self._stop = self.nrows + self._step = 1 + # and initialize the loop + self._init_loop() + + return self + + def _init_loop(self): + """Initialization for the __iter__ iterator.""" + + self._nrowsread = self._start + self._startb = self._start + self._row = -1 # Sentinel + self._init = True # Sentinel + self.nrow = SizeType(self._start - self._step) # row number + + def __next__(self): + """Get the next element of the array during an iteration. + + The element is returned as a list of objects of the current + flavor. + + """ + + if self._nrowsread >= self._stop: + self._init = False + raise StopIteration # end of iteration + else: + # Read a chunk of rows + if self._row + 1 >= self.nrowsinbuf or self._row < 0: + self._stopb = self._startb + self._step * self.nrowsinbuf + self.listarr = self.read(self._startb, self._stopb, self._step) + self._row = -1 + self._startb = self._stopb + self._row += 1 + self.nrow += self._step + self._nrowsread += self._step + return self.listarr[self._row] + + def __getitem__(self, key): + """Get a row or a range of rows from the array. + + If key argument is an integer, the corresponding array row is returned + as an object of the current flavor. If key is a slice, the range of + rows determined by it is returned as a list of objects of the current + flavor. + + In addition, NumPy-style point selections are supported. In + particular, if key is a list of row coordinates, the set of rows + determined by it is returned. Furthermore, if key is an array of + boolean values, only the coordinates where key is True are returned. + Note that for the latter to work it is necessary that key list would + contain exactly as many rows as the array has. 
+ + Examples + -------- + + :: + + a_row = vlarray[4] + a_list = vlarray[4:1000:2] + a_list2 = vlarray[[0,2]] # get list of coords + a_list3 = vlarray[[0,-2]] # negative values accepted + a_list4 = vlarray[numpy.array([True,...,False])] # array of bools + + """ + + self._g_check_open() + if is_idx(key): + key = operator.index(key) + + # Index out of range protection + if key >= self.nrows: + raise IndexError("Index out of range") + if key < 0: + # To support negative values + key += self.nrows + (start, stop, step) = self._process_range(key, key + 1, 1) + return self.read(start, stop, step)[0] + elif isinstance(key, slice): + start, stop, step = self._process_range( + key.start, key.stop, key.step) + return self.read(start, stop, step) + # Try with a boolean or point selection + elif type(key) in (list, tuple) or isinstance(key, np.ndarray): + coords = self._point_selection(key) + return self._read_coordinates(coords) + else: + raise IndexError(f"Invalid index or slice: {key!r}") + + def _assign_values(self, coords, values): + """Assign the `values` to the positions stated in `coords`.""" + + for nrow, value in zip(coords, values): + if nrow >= self.nrows: + raise IndexError("First index out of range") + if nrow < 0: + # To support negative values + nrow += self.nrows + object_ = value + # Prepare the object to convert it into a NumPy object + atom = self.atom + if not hasattr(atom, 'size'): # it is a pseudo-atom + object_ = atom.toarray(object_) + statom = atom.base + else: + statom = atom + value = convert_to_np_atom(object_, statom) + nobjects = self._getnobjects(value) + + # Get the previous value + nrow = idx2long( + nrow) # To convert any possible numpy scalar value + nparr = self._read_array(nrow, nrow + 1, 1)[0] + nobjects = len(nparr) + if len(value) > nobjects: + raise ValueError("Length of value (%s) is larger than number " + "of elements in row (%s)" % (len(value), + nobjects)) + try: + nparr[:] = value + except Exception as exc: # XXX + raise ValueError("Value parameter:\n'%r'\n" + "cannot be converted into an array object " + "compliant vlarray[%s] row: \n'%r'\n" + "The error was: <%s>" % (value, nrow, + nparr[:], exc)) + + if nparr.size > 0: + self._modify(nrow, nparr, nobjects) + + def __setitem__(self, key, value): + """Set a row, or set of rows, in the array. + + It takes different actions depending on the type of the *key* + parameter: if it is an integer, the corresponding table row is + set to *value* (a record or sequence capable of being converted + to the table structure). If *key* is a slice, the row slice + determined by it is set to *value* (a record array or sequence + of rows capable of being converted to the table structure). + + In addition, NumPy-style point selections are supported. In + particular, if key is a list of row coordinates, the set of rows + determined by it is set to value. Furthermore, if key is an array of + boolean values, only the coordinates where key is True are set to + values from value. Note that for the latter to work it is necessary + that key list would contain exactly as many rows as the table has. + + .. note:: + + When updating the rows of a VLArray object which uses a + pseudo-atom, there is a problem: you can only update values + with *exactly* the same size in bytes than the original row. + This is very difficult to meet with object pseudo-atoms, + because :mod:`pickle` applied on a Python object does not + guarantee to return the same number of bytes than over another + object, even if they are of the same class. 
+ This effectively limits the kinds of objects than can be + updated in variable-length arrays. + + Examples + -------- + + :: + + vlarray[0] = vlarray[0] * 2 + 3 + vlarray[99] = arange(96) * 2 + 3 + + # Negative values for the index are supported. + vlarray[-99] = vlarray[5] * 2 + 3 + vlarray[1:30:2] = list_of_rows + vlarray[[1,3]] = new_1_and_3_rows + + """ + + self._g_check_open() + self._v_file._check_writable() + + if is_idx(key): + # If key is not a sequence, convert to it + coords = [key] + value = [value] + elif isinstance(key, slice): + start, stop, step = self._process_range( + key.start, key.stop, key.step) + coords = range(start, stop, step) + # Try with a boolean or point selection + elif type(key) in (list, tuple) or isinstance(key, np.ndarray): + coords = self._point_selection(key) + else: + raise IndexError(f"Invalid index or slice: {key!r}") + + # Do the assignment row by row + self._assign_values(coords, value) + + # Accessor for the _read_array method in superclass + def read(self, start=None, stop=None, step=1): + """Get data in the array as a list of objects of the current flavor. + + Please note that, as the lengths of the different rows are variable, + the returned value is a *Python list* (not an array of the current + flavor), with as many entries as specified rows in the range + parameters. + + The start, stop and step parameters can be used to select only a + *range of rows* in the array. Their meanings are the same as in + the built-in range() Python function, except that negative values + of step are not allowed yet. Moreover, if only start is specified, + then stop will be set to start + 1. If you do not specify neither + start nor stop, then *all the rows* in the array are selected. + + """ + + self._g_check_open() + start, stop, step = self._process_range_read(start, stop, step) + if start == stop: + listarr = [] + else: + listarr = self._read_array(start, stop, step) + + atom = self.atom + if not hasattr(atom, 'size'): # it is a pseudo-atom + outlistarr = [atom.fromarray(arr) for arr in listarr] + else: + # Convert the list to the right flavor + flavor = self.flavor + outlistarr = [internal_to_flavor(arr, flavor) for arr in listarr] + return outlistarr + + def _read_coordinates(self, coords): + """Read rows specified in `coords`.""" + rows = [] + for coord in coords: + rows.append(self.read(int(coord), int(coord) + 1, 1)[0]) + return rows + + def _g_copy_with_stats(self, group, name, start, stop, step, + title, filters, chunkshape, _log, **kwargs): + """Private part of Leaf.copy() for each kind of leaf.""" + + # Build the new VLArray object + object = VLArray( + group, name, self.atom, title=title, filters=filters, + expectedrows=self._v_expectedrows, chunkshape=chunkshape, + _log=_log) + + # Now, fill the new vlarray with values from the old one + # This is not buffered because we cannot forsee the length + # of each record. So, the safest would be a copy row by row. + # In the future, some analysis can be done in order to buffer + # the copy process. + nrowsinbuf = 1 + (start, stop, step) = self._process_range_read(start, stop, step) + # Optimized version (no conversions, no type and shape checks, etc...) 
+ nrowscopied = SizeType(0) + nbytes = 0 + if not hasattr(self.atom, 'size'): # it is a pseudo-atom + atomsize = self.atom.base.size + else: + atomsize = self.atom.size + for start2 in range(start, stop, step * nrowsinbuf): + # Save the records on disk + stop2 = start2 + step * nrowsinbuf + if stop2 > stop: + stop2 = stop + nparr = self._read_array(start=start2, stop=stop2, step=step)[0] + nobjects = nparr.shape[0] + object._append(nparr, nobjects) + nbytes += nobjects * atomsize + nrowscopied += 1 + object.nrows = nrowscopied + return (object, nbytes) + + def __repr__(self): + """This provides more metainfo in addition to standard __str__""" + + return f"""{self} + atom = {self.atom!r} + byteorder = {self.byteorder!r} + nrows = {self.nrows} + flavor = {self.flavor!r}""" diff --git a/utils/pt2to3 b/utils/pt2to3 new file mode 100755 index 0000000..d236af8 --- /dev/null +++ b/utils/pt2to3 @@ -0,0 +1,3 @@ +#!/usr/bin/env python +from tables.scripts.pt2to3 import main +main() diff --git a/utils/ptdump b/utils/ptdump new file mode 100755 index 0000000..f1ebf6d --- /dev/null +++ b/utils/ptdump @@ -0,0 +1,3 @@ +#!/usr/bin/env python +from tables.scripts.ptdump import main +main() diff --git a/utils/ptrepack b/utils/ptrepack new file mode 100755 index 0000000..13ffa8b --- /dev/null +++ b/utils/ptrepack @@ -0,0 +1,3 @@ +#!/usr/bin/env python +from tables.scripts.ptrepack import main +main() diff --git a/utils/pttree b/utils/pttree new file mode 100755 index 0000000..ff4191c --- /dev/null +++ b/utils/pttree @@ -0,0 +1,3 @@ +#!/usr/bin/env python +from tables.scripts.pttree import main +main()
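These four wrapper scripts simply forward to the console entry points shipped
in ``tables.scripts``; the same code paths can be driven from Python as well.
A minimal sketch (``example.h5`` is only a placeholder file name)::

    import sys
    from tables.scripts.ptdump import main as ptdump_main

    # Equivalent to running ``ptdump -v example.h5`` from the shell; main()
    # takes no arguments and reads its options from sys.argv.
    sys.argv = ['ptdump', '-v', 'example.h5']
    ptdump_main()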